diff --git a/CHANGELOG.md b/CHANGELOG.md index 62b06167077..dd1c4f9eac7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,11 @@ # Changelog -## v20.07: (Upcoming Release) +## v20.10: (Upcoming Release) -### nvmf +### bdev -The NVMe-oF target no longer supports connecting scheduling configuration and instead -always uses what was previously called "transport" scheduling. +A new API was added `bdev_examine_bdev` that allows users to examine a +bdev explicitly. It can be used only if auto_examine is disabled. `spdk_nvmf_tgt_accept` no longer exists. The accept process now occurs automatically after the creation of an nvmf target and queue pairs are assigned to poll groups by the underlying @@ -13,21 +13,28 @@ transport. Further, `spdk_nvmf_transport_ops` has changed such that the accept f pointer no longer takes a function pointer as an argument. Instead, transports should call `spdk_nvmf_tgt_new_qpair` whenever they previously would have called that callback. -### nvme +Updated DPDK submodule to DPDK 20.08. -Add `opts_size` in `spdk_nvme_ctrlr_opts` structure in order to solve the compatiblity issue -for different ABI version. +### isa-l -### RPC +Updated ISA-L submodule to v2.29.0. -Command line parameters `-r` and `--rpc-socket` will longer accept TCP ports. RPC server -must now be started on a Unix domain socket. Exposing RPC on the network, as well as providing -proper authentication (if needed) is now a responsibility of the user. +### ocf -### build +Updated OCF submodule to v20.03.1 -The fio plugins now compile to `build/fio` and are named `spdk_bdev` and `spdk_nvme`. -Existing fio configuration files will need to be updated. +### sock + +The `enable_placement_id` field was added in the struct spdk_sock_impl_opts to +make the placement_id feature configurable by users. The default setting is +not enabled. + +### rpc + +A new optional parameter `enable_placement_id` was added to the `sock_impl_set_options` +RPC. 
+ +## v20.07: SPDK CSI driver, new accel_fw commands, I/O abort support ### accel @@ -42,15 +49,87 @@ A new capability, compare, was added via `spdk_accel_submit_compare`. The software accel engine implemenation has added support for compare. +Several APIs were added to `accel_engine.h` to support batched submission +of operations. + +Several APIs were added to `accel_engine.h` to support dualcast operations. + +### accel_fw + +The accel_fw was updated to no longer require the app to allocate an +accel_task on its behalf. All public APIs now take a callback arg as +the parameter that used to be the accel_task. + +The accel_fw API `spdk_accel_task_size` was removed as it is no longer +required. + +The accel_fw was updated to support compare, dualcast, crc32c. + +The accel_fw introduced batching support for all commands in all plug-ins. +See docs for detailed information. + +### bdev + +A new API `spdk_bdev_abort` has been added to submit abort requests to abort all I/Os +whose callback context match to the bdev on the given channel. + +### build + +The fio plugins now compile to `build/fio` and are named `spdk_bdev` and `spdk_nvme`. +Existing fio configuration files will need to be updated. + ### dpdk -Updated DPDK submodule to DPDK 19.11.2, which includes fixes for DPDK vulnerabilities: -CVE-2020-10722, CVE-2020-10723, CVE-2020-10724, CVE-2020-10725, CVE-2020-10724. +Updated DPDK submodule to DPDK 20.05. + +### env + +Several new APIs have been added to provide greater flexibility in registering and +accessing polled mode PCI drivers. See `env.h` for more details. ### idxd +The idxd library and plug-in module for the accel_fw were updated to support +all accel_fw commands as well as batching. Batching is supported both +through the library and the plug-in module. + IDXD engine support for CRC-32C has been added. +### ioat + +A new API `spdk_ioat_get_max_descriptors` was added. 
+ +### nvme + +An `opts_size` element was added in the `spdk_nvme_ctrlr_opts` structure +to solve the ABI compatibility issue between different SPDK versions. + +A new API `spdk_nvme_ctrlr_cmd_abort_ext` has been added to abort previously submitted +commands whose callback argument matches. + +Convenience functions, `spdk_nvme_print_command` and `spdk_nvme_print_completion`, were added +to the public API. + +A new function, `spdk_nvme_cuse_update_namespaces`, updates the cuse representation of an NVMe +controller. + +A new function `qpair_iterate_requests` has been added to the nvme transport interface. All +implementations of the transport interface will have to implement that function. + +### nvmf + +The NVMe-oF target no longer supports connecting scheduling configuration and instead +always uses what was previously called "transport" scheduling. + +`spdk_nvmf_tgt_accept` no longer takes a function pointer as an argument. New connections +are automatically assigned to poll groups by the underlying transport. Further, +`spdk_nvmf_transport_ops` has changed such that the accept function pointer no longer +takes a function pointer as an argument. Instead, transports should call +`spdk_nvmf_tgt_new_qpair` whenever they previously would have called that callback. + +The NVMe-oF target now supports aborting any submitted NVM or Admin command. Previously, +the NVMe-oF target could abort only Asynchronous Event Request commands. + ### rdma A new `rdma` library has been added. It is an abstraction layer over different RDMA providers. @@ -61,10 +140,19 @@ Using mlx5_dv requires libmlx5 installed on the system. ### rpc Parameter `-p` or `--max-qpairs-per-ctrlr` of `nvmf_create_transport` RPC command accepted by the -rpc.py script is deprecated, new parameter `-m` or `--max-io-qpairs-per-ctrlr` is added. +rpc.py script is deprecated, new parameter `-m` or `--max-io-qpairs-per-ctrlr` was added. 
-Parameter `max_qpairs_per_ctrlr` of `nvmf_create_transport` RPC command accepted by the NVMF target -is deprecated, new parameter `max_io_qpairs_per_ctrlr` is added. +Added `sock_impl_get_options` and `sock_impl_set_options` RPC methods. + +Command line parameters `-r` and `--rpc-socket` will no longer accept TCP ports. RPC server +must now be started on a Unix domain socket. Exposing RPC on the network, as well as providing +proper authentication (if needed) is now a responsibility of the user. + +The `bdev_set_options` RPC has a new option, `bdev_auto_examine`, to control the auto examine function +of bdev modules. + +New RPCs `sock_impl_get_options` and `sock_impl_set_options` have been added to expose new socket features. +See `sock` section for more details. ### sock @@ -73,11 +161,26 @@ options. Options can be set independently for each implementation. Added `recv_buf_size` and 'send_buf_size' socket layer options. They are used only in posix implementation. +Added `uring` based socket implementation, the code is located in module/sock/uring. This feature is only +available on Linux and requires a kernel version greater than 5.4.3. Currently, our CI pool added the uring +based socket tests for iSCSI target and also the tests for SPDK NVMe-oF tcp transport. + +Added `enable_recv_pipe` socket layer option to allow disabling of double buffering on receive. +New option is used only in posix implementation. + +Added `enable_zerocopy_send` socket layer option to allow disabling of zero copy flow on send. +New option is used only in posix implementation. + +### util + +Some previously exposed CRC32 functions have been removed from the public API - +`spdk_crc32_update`, `spdk_crc32_table_init`, and the `spdk_crc32_table` struct. + ### vhost The function `spdk_vhost_blk_get_dev` has been removed. -## v20.04: +## v20.04: SPDK Top, IDXD, NVMe qpair groups IDXD engine support for compare has been added. 
@@ -280,7 +383,7 @@ Poll groups per session have been replaced by SPDK threads per vhost controller. A new function, `spdk_vmd_fini`, has been added. It releases all resources acquired by the VMD library through the `spdk_vmd_init` call. -## v20.01 +## v20.01: Optimized thin provisioning, FTL bdev, VMD hot plug, FUSED support ### bdev @@ -470,7 +573,7 @@ code for fused compare-and-write operation. Added spdk_bdev_get_acwu function for getting block device atomic compare and write unit size. -## v19.10 +## v19.10: Zoned bdev API, Opal bdev, NVMe character devices ### rpc @@ -700,7 +803,7 @@ New cache modes added to use via RPC, wi - write invalidate and wa - write aroun New version of OCF provides fully asynchronous management API. -## v19.07 +## v19.07: NVMe-oF FC Transport, VMD, NVMe-oF Persistent reservations, Bdev I/O with separate metadata ### ftl @@ -976,7 +1079,7 @@ with SPDK thread when necessary. Added spdk_thread_destroy() to allow framework polling the thread to release resources associated with that thread. -## v19.04 +## v19.04: Compression bdev, Notification library, NVMe Opal support ### nvme @@ -1197,7 +1300,7 @@ Added "reduce" block compression scheme based on using SSDs for storing compressed blocks of storage and presistent memory for metadata. Please see [compression](https://spdk.io/doc/bdev.html) for more details. -## v19.01 +## v19.01: NVMe-oF TCP/IP Transport, Open Channel SSD Flash Translation Layer, Caching bdev based on OCF, ISA-L Support, DIF/DIX library ### ocf bdev @@ -1452,7 +1555,7 @@ JSON RPC client is now running in non-blocking mode. Requests are sent and recei JSON RPC server can now recieve a callback on connection termination or server shutdown using `spdk_jsonrpc_conn_add_close_cb` and `spdk_jsonrpc_conn_del_close_cb`. -## v18.10 +## v18.10: Dynamic memory allocation, Crypto Virtual Bdev, jsonrpc-client, SPDKCLI iSCSI and NVMe-oF support ### nvme @@ -1649,7 +1752,7 @@ in QEMU. 
The SPDKCLI interactive command tool for managing SPDK is no longer considered experimental. Support for the iSCSI and NVMe-oF targets has been added. -## v18.07 +## v18.07: Raid, Infrastructure Improvements, Bug Fixes ### bdev diff --git a/CONFIG b/CONFIG index d947909e9a9..569e53bf74b 100644 --- a/CONFIG +++ b/CONFIG @@ -125,10 +125,6 @@ CONFIG_PMDK_DIR= # Enable the dependencies for building the compress vbdev CONFIG_REDUCE=n -# Build with VPP -CONFIG_VPP=n -CONFIG_VPP_DIR= - # Requires libiscsi development libraries. CONFIG_ISCSI_INITIATOR=n diff --git a/Makefile b/Makefile index 5cef1c0a424..e77725e87fc 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ # BSD LICENSE # # Copyright (c) Intel Corporation. +# Copyright (c) 2020, Mellanox Corporation. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -49,6 +50,9 @@ DIRS-$(CONFIG_ISAL) += isalbuild cc_version cxx_version .libs_only_other .ldflags ldflags install \ uninstall +# Workaround for ninja. See dpdkbuild/Makefile +export MAKE_PID := $(shell echo $$PPID) + ifeq ($(SPDK_ROOT_DIR)/lib/env_dpdk,$(CONFIG_ENV)) ifeq ($(CURDIR)/dpdk/build,$(CONFIG_DPDK_DIR)) ifneq ($(SKIP_DPDK_BUILD),1) @@ -81,6 +85,7 @@ clean: $(DIRS-y) $(Q)rm -rf build/fio $(Q)rm -rf build/examples $(Q)rm -rf build/include + $(Q)find build/lib ! 
-name .gitignore -type f -delete install: all $(Q)echo "Installed to $(DESTDIR)$(CONFIG_PREFIX)" diff --git a/app/Makefile b/app/Makefile index 5a60f669a75..8ff318ddf3d 100644 --- a/app/Makefile +++ b/app/Makefile @@ -44,6 +44,7 @@ DIRS-y += spdk_lspci DIRS-y += spdk_top ifeq ($(OS),Linux) DIRS-$(CONFIG_VHOST) += vhost +DIRS-y += spdk_dd endif .PHONY: all clean $(DIRS-y) diff --git a/app/iscsi_tgt/Makefile b/app/iscsi_tgt/Makefile index 24af5bb9ae9..6b695d91aa5 100644 --- a/app/iscsi_tgt/Makefile +++ b/app/iscsi_tgt/Makefile @@ -44,7 +44,7 @@ CFLAGS += -I$(SPDK_ROOT_DIR)/lib C_SRCS := iscsi_tgt.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_iscsi event_net event_scsi event_vmd event +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) event_iscsi event_net event_scsi event SPDK_LIB_LIST += jsonrpc json rpc bdev_rpc bdev iscsi scsi accel trace conf SPDK_LIB_LIST += thread util log log_rpc app_rpc net sock notify diff --git a/app/iscsi_top/Makefile b/app/iscsi_top/Makefile index 0d66fcfb28b..86fd73edf35 100644 --- a/app/iscsi_top/Makefile +++ b/app/iscsi_top/Makefile @@ -36,8 +36,8 @@ include $(SPDK_ROOT_DIR)/mk/spdk.common.mk include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk APP = iscsi_top +SPDK_NO_LINK_ENV = 1 -CXXFLAGS += $(ENV_CXXFLAGS) CXXFLAGS += -I$(SPDK_ROOT_DIR)/lib CXX_SRCS := iscsi_top.cpp diff --git a/app/nvmf_tgt/Makefile b/app/nvmf_tgt/Makefile index 67be0c87f2c..37d71cd88a7 100644 --- a/app/nvmf_tgt/Makefile +++ b/app/nvmf_tgt/Makefile @@ -40,7 +40,7 @@ APP = nvmf_tgt C_SRCS := nvmf_main.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_nvmf event_net event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) event_nvmf event_net SPDK_LIB_LIST += nvmf event log trace conf thread util bdev accel rpc jsonrpc json net sock SPDK_LIB_LIST += app_rpc log_rpc bdev_rpc notify diff --git a/app/spdk_dd/.gitignore b/app/spdk_dd/.gitignore new file mode 100644 index 00000000000..8810437a913 --- /dev/null +++ 
b/app/spdk_dd/.gitignore @@ -0,0 +1 @@ +spdk_dd diff --git a/app/spdk_dd/Makefile b/app/spdk_dd/Makefile new file mode 100644 index 00000000000..3bd99f639f8 --- /dev/null +++ b/app/spdk_dd/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk + +APP = spdk_dd + +C_SRCS := spdk_dd.c + +SPDK_LIB_LIST = $(ALL_MODULES_LIST) +SPDK_LIB_LIST += event_sock event_bdev event_accel event_vmd +SPDK_LIB_LIST += bdev accel event thread util conf trace \ + log jsonrpc json rpc sock notify + +include $(SPDK_ROOT_DIR)/mk/spdk.app.mk diff --git a/app/spdk_dd/spdk_dd.c b/app/spdk_dd/spdk_dd.c new file mode 100644 index 00000000000..c9f2dd11780 --- /dev/null +++ b/app/spdk_dd/spdk_dd.c @@ -0,0 +1,1174 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/event.h" +#include "spdk/fd.h" +#include "spdk/string.h" +#include "spdk/vmd.h" + +#include + +#ifdef SPDK_CONFIG_URING +#include +#endif + +#define DD_NSEC_SINCE_X(time_now, time_x) ((1000000000 * time_now.tv_sec + time_now.tv_nsec) \ + - (1000000000 * time_x.tv_sec + time_x.tv_nsec)) + +struct spdk_dd_opts { + char *input_file; + char *output_file; + char *input_file_flags; + char *output_file_flags; + char *input_bdev; + char *output_bdev; + uint64_t input_offset; + uint64_t output_offset; + int64_t io_unit_size; + int64_t io_unit_count; + uint32_t queue_depth; + bool aio; +}; + +static struct spdk_dd_opts g_opts = { + .io_unit_size = 4096, + .queue_depth = 2, +}; + +enum dd_submit_type { + DD_POPULATE, + DD_READ, + DD_WRITE, +}; + +struct dd_io { + uint64_t offset; + uint64_t length; + struct iocb iocb; + enum dd_submit_type type; +#ifdef SPDK_CONFIG_URING + struct iovec iov; +#endif + void *buf; +}; + +enum dd_target_type { + DD_TARGET_TYPE_FILE, + DD_TARGET_TYPE_BDEV, +}; + +struct dd_target { + enum dd_target_type type; + + union { + struct { + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + } bdev; + +#ifdef SPDK_CONFIG_URING + struct { + int fd; + struct io_uring ring; + struct spdk_poller *poller; + } uring; +#endif + struct { + int fd; + io_context_t io_ctx; + struct spdk_poller *poller; + } aio; + } u; + + /* 
Block size of underlying device. */ + uint32_t block_size; + + /* Position of next I/O in bytes */ + uint64_t pos; + + /* Total size of target in bytes */ + uint64_t total_size; + + bool open; +}; + +struct dd_job { + struct dd_target input; + struct dd_target output; + + struct dd_io *ios; + + uint32_t outstanding; + uint64_t copy_size; +}; + +struct dd_flags { + char *name; + int flag; +}; + +static struct dd_flags g_flags[] = { + {"append", O_APPEND}, + {"direct", O_DIRECT}, + {"directory", O_DIRECTORY}, + {"dsync", O_DSYNC}, + {"noatime", O_NOATIME}, + {"noctty", O_NOCTTY}, + {"nofollow", O_NOFOLLOW}, + {"nonblock", O_NONBLOCK}, + {"sync", O_SYNC}, + {NULL, 0} +}; + +static struct dd_job g_job = {}; +static int g_error = 0; +static struct timespec g_start_time; +static bool g_interrupt; + +static void dd_target_populate_buffer(struct dd_io *io); + +static void +dd_exit(int rc) +{ + if (g_job.input.type == DD_TARGET_TYPE_FILE) { +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + spdk_poller_unregister(&g_job.input.u.uring.poller); + close(g_job.input.u.uring.fd); + } else +#endif + { + spdk_poller_unregister(&g_job.input.u.aio.poller); + io_destroy(g_job.input.u.aio.io_ctx); + close(g_job.input.u.aio.fd); + } + } else if (g_job.input.type == DD_TARGET_TYPE_BDEV && g_job.input.open) { + spdk_put_io_channel(g_job.input.u.bdev.ch); + spdk_bdev_close(g_job.input.u.bdev.desc); + } + + if (g_job.output.type == DD_TARGET_TYPE_FILE) { +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + spdk_poller_unregister(&g_job.output.u.uring.poller); + close(g_job.output.u.uring.fd); + } else +#endif + { + spdk_poller_unregister(&g_job.output.u.aio.poller); + io_destroy(g_job.output.u.aio.io_ctx); + close(g_job.output.u.aio.fd); + } + } else if (g_job.output.type == DD_TARGET_TYPE_BDEV && g_job.output.open) { + spdk_put_io_channel(g_job.output.u.bdev.ch); + spdk_bdev_close(g_job.output.u.bdev.desc); + } + + spdk_app_stop(rc); +} + +static void +dd_show_progress(uint64_t 
offset, uint64_t length, bool finish) +{ + char *unit_str[5] = {"", "k", "M", "G", "T"}; + char *speed_type_str[2] = {"", "average "}; + char *size_unit_str = ""; + char *speed_unit_str = ""; + char *speed_type = ""; + uint64_t size = g_job.copy_size; + uint64_t size_unit = 1; + uint64_t speed_unit = 1; + uint64_t speed, tmp_speed; + static struct timespec g_time_last = {.tv_nsec = 0}; + static uint64_t g_data_last = 0; + struct timespec time_now; + int i = 0; + + clock_gettime(CLOCK_REALTIME, &time_now); + + if (((time_now.tv_sec == g_time_last.tv_sec && offset + length != g_job.copy_size) || + (offset < g_data_last)) && !finish) { + /* refresh every one second */ + return; + } + + /* Find the rigth unit for size displaying (B vs kB vs MB vs GB vs TB) */ + while (size > 1024 * 10) { + size >>= 10; + size_unit <<= 10; + size_unit_str = unit_str[++i]; + if (i == 4) { + break; + } + } + + if (!finish) { + speed_type = speed_type_str[0]; + tmp_speed = speed = (offset - g_data_last) * 1000000000 / DD_NSEC_SINCE_X(time_now, g_time_last); + } else { + speed_type = speed_type_str[1]; + tmp_speed = speed = offset * 1000000000 / DD_NSEC_SINCE_X(time_now, g_start_time); + } + + i = 0; + + /* Find the rigth unit for speed displaying (Bps vs kBps vs MBps vs GBps vs TBps) */ + while (tmp_speed > 1024) { + tmp_speed >>= 10; + speed_unit <<= 10; + speed_unit_str = unit_str[++i]; + if (i == 4) { + break; + } + } + + printf("\33[2K\rCopying: %" PRIu64 "/%" PRIu64 " [%sB] (%s%" PRIu64 " %sBps)", + (offset + length) / size_unit, g_job.copy_size / size_unit, size_unit_str, speed_type, + speed / speed_unit, speed_unit_str); + fflush(stdout); + + g_data_last = offset; + g_time_last = time_now; +} + +#ifdef SPDK_CONFIG_URING +static void +dd_uring_submit(struct dd_io *io, struct dd_target *target, uint64_t length, uint64_t offset) +{ + struct io_uring_sqe *sqe; + + io->iov.iov_base = io->buf; + io->iov.iov_len = length; + sqe = io_uring_get_sqe(&target->u.uring.ring); + if (io->type == 
DD_READ || io->type == DD_POPULATE) { + io_uring_prep_readv(sqe, target->u.uring.fd, &io->iov, 1, offset); + } else { + io_uring_prep_writev(sqe, target->u.uring.fd, &io->iov, 1, offset); + } + io_uring_sqe_set_data(sqe, io); + io_uring_submit(&target->u.uring.ring); +} +#endif + +static void +_dd_write_bdev_done(struct spdk_bdev_io *bdev_io, + bool success, + void *cb_arg) +{ + struct dd_io *io = cb_arg; + + assert(g_job.outstanding > 0); + g_job.outstanding--; + spdk_bdev_free_io(bdev_io); + dd_target_populate_buffer(io); +} + +static void +dd_target_write(struct dd_io *io) +{ + struct dd_target *target = &g_job.output; + uint64_t length = SPDK_CEIL_DIV(io->length, target->block_size) * target->block_size; + uint64_t read_region_start = g_opts.input_offset * g_opts.io_unit_size; + uint64_t read_offset = io->offset - read_region_start; + uint64_t write_region_start = g_opts.output_offset * g_opts.io_unit_size; + uint64_t write_offset = write_region_start + read_offset; + int rc = 0; + + if (g_error != 0 || g_interrupt == true) { + if (g_job.outstanding == 0) { + if (g_error == 0) { + dd_show_progress(io->offset, io->length, true); + printf("\n\n"); + } + dd_exit(g_error); + } + return; + } + + dd_show_progress(read_offset, io->length, false); + + g_job.outstanding++; + io->type = DD_WRITE; + + if (target->type == DD_TARGET_TYPE_FILE) { +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + dd_uring_submit(io, target, length, write_offset); + } else +#endif + { + struct iocb *iocb = &io->iocb; + + io_prep_pwrite(iocb, target->u.aio.fd, io->buf, length, write_offset); + iocb->data = io; + if (io_submit(target->u.aio.io_ctx, 1, &iocb) < 0) { + rc = -errno; + } + } + } else if (target->type == DD_TARGET_TYPE_BDEV) { + rc = spdk_bdev_write(target->u.bdev.desc, target->u.bdev.ch, io->buf, write_offset, length, + _dd_write_bdev_done, io); + } + + if (rc != 0) { + SPDK_ERRLOG("%s\n", strerror(-rc)); + assert(g_job.outstanding > 0); + g_job.outstanding--; + g_error = rc; 
+ if (g_job.outstanding == 0) { + dd_exit(rc); + } + return; + } +} + +static void +_dd_read_bdev_done(struct spdk_bdev_io *bdev_io, + bool success, + void *cb_arg) +{ + struct dd_io *io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + assert(g_job.outstanding > 0); + g_job.outstanding--; + dd_target_write(io); +} + +static void +dd_target_read(struct dd_io *io) +{ + struct dd_target *target = &g_job.input; + int rc = 0; + + if (g_error != 0 || g_interrupt == true) { + if (g_job.outstanding == 0) { + dd_exit(g_error); + } + return; + } + + g_job.outstanding++; + io->type = DD_READ; + + if (target->type == DD_TARGET_TYPE_FILE) { +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + dd_uring_submit(io, target, io->length, io->offset); + } else +#endif + { + struct iocb *iocb = &io->iocb; + + io_prep_pread(iocb, target->u.aio.fd, io->buf, io->length, io->offset); + iocb->data = io; + if (io_submit(target->u.aio.io_ctx, 1, &iocb) < 0) { + rc = -errno; + } + } + } else if (target->type == DD_TARGET_TYPE_BDEV) { + rc = spdk_bdev_read(target->u.bdev.desc, target->u.bdev.ch, io->buf, io->offset, io->length, + _dd_read_bdev_done, io); + } + + if (rc != 0) { + SPDK_ERRLOG("%s\n", strerror(-rc)); + assert(g_job.outstanding > 0); + g_job.outstanding--; + g_error = rc; + if (g_job.outstanding == 0) { + dd_exit(rc); + } + return; + } +} + +static void +_dd_target_populate_buffer_done(struct spdk_bdev_io *bdev_io, + bool success, + void *cb_arg) +{ + struct dd_io *io = cb_arg; + + assert(g_job.outstanding > 0); + g_job.outstanding--; + spdk_bdev_free_io(bdev_io); + dd_target_read(io); +} + +static void +dd_target_populate_buffer(struct dd_io *io) +{ + struct dd_target *target = &g_job.output; + uint64_t read_region_start = g_opts.input_offset * g_opts.io_unit_size; + uint64_t read_offset = g_job.input.pos - read_region_start; + uint64_t write_region_start = g_opts.output_offset * g_opts.io_unit_size; + uint64_t write_offset = write_region_start + read_offset; + uint64_t length; + 
int rc = 0; + + io->offset = g_job.input.pos; + io->length = spdk_min((uint64_t)g_opts.io_unit_size, g_job.copy_size - read_offset); + + if (io->length == 0 || g_error != 0 || g_interrupt == true) { + if (g_job.outstanding == 0) { + if (g_error == 0) { + dd_show_progress(read_offset, io->length, true); + printf("\n\n"); + } + dd_exit(g_error); + } + return; + } + + g_job.input.pos += io->length; + + if ((io->length % target->block_size) == 0) { + dd_target_read(io); + return; + } + + /* Read whole blocks from output to combine buffers later */ + g_job.outstanding++; + io->type = DD_POPULATE; + + length = SPDK_CEIL_DIV(io->length, target->block_size) * target->block_size; + + if (target->type == DD_TARGET_TYPE_FILE) { +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + dd_uring_submit(io, target, length, write_offset); + } else +#endif + { + struct iocb *iocb = &io->iocb; + + io_prep_pread(iocb, target->u.aio.fd, io->buf, length, write_offset); + iocb->data = io; + if (io_submit(target->u.aio.io_ctx, 1, &iocb) < 0) { + rc = -errno; + } + } + } else if (target->type == DD_TARGET_TYPE_BDEV) { + rc = spdk_bdev_read(target->u.bdev.desc, target->u.bdev.ch, io->buf, write_offset, length, + _dd_target_populate_buffer_done, io); + } + + if (rc != 0) { + SPDK_ERRLOG("%s\n", strerror(-rc)); + assert(g_job.outstanding > 0); + g_job.outstanding--; + g_error = rc; + if (g_job.outstanding == 0) { + dd_exit(rc); + } + return; + } +} + +static void +dd_complete_poll(struct dd_io *io) +{ + assert(g_job.outstanding > 0); + g_job.outstanding--; + + switch (io->type) { + case DD_POPULATE: + dd_target_read(io); + break; + case DD_READ: + dd_target_write(io); + break; + case DD_WRITE: + dd_target_populate_buffer(io); + break; + default: + assert(false); + break; + } +} + +#ifdef SPDK_CONFIG_URING +static int +dd_uring_poll(void *ctx) +{ + struct dd_target *target = ctx; + struct io_uring_cqe *cqe; + struct dd_io *io; + int rc = 0; + int i; + + for (i = 0; i < 
(int)g_opts.queue_depth; i++) { + rc = io_uring_peek_cqe(&target->u.uring.ring, &cqe); + if (rc == 0) { + if (cqe->res == -EAGAIN) { + continue; + } else if (cqe->res < 0) { + SPDK_ERRLOG("%s\n", strerror(-cqe->res)); + g_error = cqe->res; + } + + io = io_uring_cqe_get_data(cqe); + io_uring_cqe_seen(&target->u.uring.ring, cqe); + + dd_complete_poll(io); + } else if (rc != - EAGAIN) { + SPDK_ERRLOG("%s\n", strerror(-rc)); + g_error = rc; + } + } + + return rc; +} +#endif + +static int +dd_aio_poll(io_context_t io_ctx) +{ + struct io_event events[32]; + int rc = 0; + int i; + struct timespec timeout; + struct dd_io *io; + + timeout.tv_sec = 0; + timeout.tv_nsec = 0; + + rc = io_getevents(io_ctx, 0, 32, events, &timeout); + + if (rc < 0) { + SPDK_ERRLOG("%s\n", strerror(-rc)); + dd_exit(rc); + } + + for (i = 0; i < rc; i++) { + io = events[i].data; + if (events[i].res != io->length) { + g_error = rc = -ENOSPC; + } + + dd_complete_poll(io); + } + + return rc; +} + +static int +dd_input_poll(void *ctx) +{ + int rc = 0; + + assert(g_job.input.type == DD_TARGET_TYPE_FILE); + + rc = dd_aio_poll(g_job.input.u.aio.io_ctx); + if (rc == -ENOSPC) { + SPDK_ERRLOG("No more file content to read\n"); + } + + return rc; +} + +static int +dd_output_poll(void *ctx) +{ + int rc = 0; + + assert(g_job.output.type == DD_TARGET_TYPE_FILE); + + rc = dd_aio_poll(g_job.output.u.aio.io_ctx); + if (rc == -ENOSPC) { + SPDK_ERRLOG("No space left on device\n"); + } + + return rc; +} + +static int +dd_open_file(struct dd_target *target, const char *fname, int flags, uint64_t skip_blocks, + bool input) +{ + int *fd; + +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + fd = &target->u.uring.fd; + } else +#endif + { + fd = &target->u.aio.fd; + } + + flags |= O_RDWR; + + if (input == false && ((flags & O_DIRECTORY) == 0)) { + flags |= O_CREAT; + } + + if (input == false && ((flags & O_APPEND) == 0)) { + flags |= O_TRUNC; + } + +#ifdef SPDK_CONFIG_URING + /* io_uring does not work correctly with 
O_NONBLOCK flag */ + if (flags & O_NONBLOCK && g_opts.aio == false) { + flags &= ~O_NONBLOCK; + SPDK_WARNLOG("Skipping 'nonblock' flag due to existing issue with uring implementation and this flag\n"); + } +#endif + + target->type = DD_TARGET_TYPE_FILE; + *fd = open(fname, flags, 0600); + if (*fd < 0) { + SPDK_ERRLOG("Could not open file %s: %s\n", fname, strerror(errno)); + return *fd; + } + + target->block_size = spdk_max(spdk_fd_get_blocklen(*fd), 1); + target->total_size = spdk_fd_get_size(*fd); + + if (input == true) { + g_opts.queue_depth = spdk_min(g_opts.queue_depth, + (target->total_size / g_opts.io_unit_size) - skip_blocks + 1); + } + + if (g_opts.io_unit_count != 0) { + g_opts.queue_depth = spdk_min(g_opts.queue_depth, g_opts.io_unit_count); + } + +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + io_uring_queue_init(g_opts.queue_depth, &target->u.uring.ring, 0); + target->open = true; + return 0; + } else +#endif + { + return io_setup(g_opts.queue_depth, &target->u.aio.io_ctx); + } +} + +static int +dd_open_bdev(struct dd_target *target, const char *bdev_name, uint64_t skip_blocks) +{ + int rc; + + target->type = DD_TARGET_TYPE_BDEV; + target->u.bdev.bdev = spdk_bdev_get_by_name(bdev_name); + if (target->u.bdev.bdev == NULL) { + SPDK_ERRLOG("Could not find bdev %s\n", bdev_name); + return -EINVAL; + } + + target->block_size = spdk_bdev_get_block_size(target->u.bdev.bdev); + target->total_size = spdk_bdev_get_num_blocks(target->u.bdev.bdev) * target->block_size; + + rc = spdk_bdev_open(target->u.bdev.bdev, true, NULL, NULL, &target->u.bdev.desc); + if (rc < 0) { + SPDK_ERRLOG("Could not open bdev %s: %s\n", bdev_name, strerror(-rc)); + return rc; + } + + target->open = true; + + target->u.bdev.ch = spdk_bdev_get_io_channel(target->u.bdev.desc); + if (target->u.bdev.ch == NULL) { + spdk_bdev_close(target->u.bdev.desc); + SPDK_ERRLOG("Could not get I/O channel: %s\n", strerror(ENOMEM)); + return -ENOMEM; + } + + g_opts.queue_depth = 
spdk_min(g_opts.queue_depth, + (target->total_size / g_opts.io_unit_size) - skip_blocks + 1); + + if (g_opts.io_unit_count != 0) { + g_opts.queue_depth = spdk_min(g_opts.queue_depth, g_opts.io_unit_count); + } + + return 0; +} + +static void dd_finish(void) +{ + /* Interrupt operation */ + g_interrupt = true; +} + +static int +parse_flags(char *file_flags) +{ + char *input_flag; + int flags = 0; + int i; + bool found = false; + + /* Translate input flags to file open flags */ + while ((input_flag = strsep(&file_flags, ","))) { + for (i = 0; g_flags[i].name != NULL; i++) { + if (!strcmp(input_flag, g_flags[i].name)) { + flags |= g_flags[i].flag; + found = true; + break; + } + } + + if (found == false) { + SPDK_ERRLOG("Unknown file flag: %s\n", input_flag); + return -EINVAL; + } + + found = false; + } + + return flags; +} + +static void +dd_run(void *arg1) +{ + uint64_t write_size; + uint32_t i; + int rc, flags = 0; + + if (g_opts.input_file) { + if (g_opts.input_file_flags) { + flags = parse_flags(g_opts.input_file_flags); + } + + if (dd_open_file(&g_job.input, g_opts.input_file, flags, g_opts.input_offset, true) < 0) { + SPDK_ERRLOG("%s: %s\n", g_opts.input_file, strerror(errno)); + dd_exit(-errno); + return; + } +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + g_job.input.u.uring.poller = spdk_poller_register(dd_uring_poll, &g_job.input, 0); + } else +#endif + { + g_job.input.u.aio.poller = spdk_poller_register(dd_input_poll, NULL, 0); + } + } else if (g_opts.input_bdev) { + rc = dd_open_bdev(&g_job.input, g_opts.input_bdev, g_opts.input_offset); + if (rc < 0) { + SPDK_ERRLOG("%s: %s\n", g_opts.input_bdev, strerror(-rc)); + dd_exit(rc); + return; + } + } + + write_size = g_opts.io_unit_count * g_opts.io_unit_size; + g_job.input.pos = g_opts.input_offset * g_opts.io_unit_size; + + /* We cannot check write size for input files because /dev/zeros, /dev/random, etc would not work. 
+ * We will handle that during copying */ + if (g_opts.input_bdev && g_job.input.pos > g_job.input.total_size) { + SPDK_ERRLOG("--skip value too big (%" PRIu64 ") - only %" PRIu64 " blocks available in input\n", + g_opts.input_offset, g_job.input.total_size / g_opts.io_unit_size); + dd_exit(-ENOSPC); + return; + } + + if (g_opts.io_unit_count != 0 && g_opts.input_bdev && + write_size + g_job.input.pos > g_job.input.total_size) { + SPDK_ERRLOG("--count value too big (%" PRIu64 ") - only %" PRIu64 " blocks available from input\n", + g_opts.io_unit_count, (g_job.input.total_size - g_job.input.pos) / g_opts.io_unit_size); + dd_exit(-ENOSPC); + return; + } + + if (g_opts.io_unit_count != 0) { + g_job.copy_size = write_size; + } else { + g_job.copy_size = g_job.input.total_size - g_job.input.pos; + } + + g_job.output.pos = g_opts.output_offset * g_opts.io_unit_size; + + if (g_opts.output_file) { + flags = 0; + + if (g_opts.output_file_flags) { + flags = parse_flags(g_opts.output_file_flags); + } + + if (dd_open_file(&g_job.output, g_opts.output_file, flags, g_opts.output_offset, false) < 0) { + SPDK_ERRLOG("%s: %s\n", g_opts.output_file, strerror(errno)); + dd_exit(-errno); + return; + } +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + g_job.output.u.uring.poller = spdk_poller_register(dd_uring_poll, &g_job.output, 0); + } else +#endif + { + g_job.output.u.aio.poller = spdk_poller_register(dd_output_poll, NULL, 0); + } + } else if (g_opts.output_bdev) { + rc = dd_open_bdev(&g_job.output, g_opts.output_bdev, g_opts.output_offset); + if (rc < 0) { + SPDK_ERRLOG("%s: %s\n", g_opts.output_bdev, strerror(-rc)); + dd_exit(rc); + return; + } + + if (g_job.output.pos > g_job.output.total_size) { + SPDK_ERRLOG("--seek value too big (%" PRIu64 ") - only %" PRIu64 " blocks available in output\n", + g_opts.output_offset, g_job.output.total_size / g_opts.io_unit_size); + dd_exit(-ENOSPC); + return; + } + + if (g_opts.io_unit_count != 0 && write_size + g_job.output.pos > 
g_job.output.total_size) { + SPDK_ERRLOG("--count value too big (%" PRIu64 ") - only %" PRIu64 " blocks available in output\n", + g_opts.io_unit_count, (g_job.output.total_size - g_job.output.pos) / g_opts.io_unit_size); + dd_exit(-ENOSPC); + return; + } + } + + if ((g_job.output.block_size > g_opts.io_unit_size) || + (g_job.input.block_size > g_opts.io_unit_size)) { + SPDK_ERRLOG("--bs value cannot be less than input (%d) neither output (%d) native block size\n", + g_job.input.block_size, g_job.output.block_size); + dd_exit(-EINVAL); + return; + } + + g_job.ios = calloc(g_opts.queue_depth, sizeof(struct dd_io)); + if (g_job.ios == NULL) { + SPDK_ERRLOG("%s\n", strerror(ENOMEM)); + dd_exit(-ENOMEM); + return; + } + + for (i = 0; i < g_opts.queue_depth; i++) { + g_job.ios[i].buf = spdk_malloc(g_opts.io_unit_size, 0x1000, NULL, 0, SPDK_MALLOC_DMA); + if (g_job.ios[i].buf == NULL) { + SPDK_ERRLOG("%s - try smaller block size value\n", strerror(ENOMEM)); + dd_exit(-ENOMEM); + return; + } + } + + clock_gettime(CLOCK_REALTIME, &g_start_time); + + for (i = 0; i < g_opts.queue_depth; i++) { + dd_target_populate_buffer(&g_job.ios[i]); + } + +} + +enum dd_cmdline_opts { + DD_OPTION_IF = 0x1000, + DD_OPTION_OF, + DD_OPTION_IFLAGS, + DD_OPTION_OFLAGS, + DD_OPTION_IB, + DD_OPTION_OB, + DD_OPTION_SKIP, + DD_OPTION_SEEK, + DD_OPTION_BS, + DD_OPTION_QD, + DD_OPTION_COUNT, + DD_OPTION_AIO, +}; + +static struct option g_cmdline_opts[] = { + { + .name = "if", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_IF, + }, + { + .name = "of", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_OF, + }, + { + .name = "iflag", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_IFLAGS, + }, + { + .name = "oflag", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_OFLAGS, + }, + { + .name = "ib", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_IB, + }, + { + .name = "ob", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_OB, + }, + { + .name = "skip", + .has_arg = 1, + .flag = NULL, + .val 
= DD_OPTION_SKIP, + }, + { + .name = "seek", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_SEEK, + }, + { + .name = "bs", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_BS, + }, + { + .name = "qd", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_QD, + }, + { + .name = "count", + .has_arg = 1, + .flag = NULL, + .val = DD_OPTION_COUNT, + }, + { + .name = "aio", + .has_arg = 0, + .flag = NULL, + .val = DD_OPTION_AIO, + }, + { + .name = NULL + } +}; + +static void +usage(void) +{ + printf("[--------- DD Options ---------]\n"); + printf(" --if Input file. Must specify either --if or --ib.\n"); + printf(" --ib Input bdev. Must specifier either --if or --ib\n"); + printf(" --of Output file. Must specify either --of or --ob.\n"); + printf(" --ob Output bdev. Must specify either --of or --ob.\n"); + printf(" --iflag Input file flags.\n"); + printf(" --oflag Onput file flags.\n"); + printf(" --bs I/O unit size (default: %" PRId64 ")\n", g_opts.io_unit_size); + printf(" --qd Queue depth (default: %d)\n", g_opts.queue_depth); + printf(" --count I/O unit count. The number of I/O units to copy. (default: all)\n"); + printf(" --skip Skip this many I/O units at start of input. (default: 0)\n"); + printf(" --seek Skip this many I/O units at start of output. (default: 0)\n"); + printf(" --aio Force usage of AIO. 
(by default io_uring is used if available)\n"); + printf(" Available iflag and oflag values:\n"); + printf(" append - append mode\n"); + printf(" direct - use direct I/O for data\n"); + printf(" directory - fail unless a directory\n"); + printf(" dsync - use synchronized I/O for data\n"); + printf(" noatime - do not update access time\n"); + printf(" noctty - do not assign controlling terminal from file\n"); + printf(" nofollow - do not follow symlinks\n"); + printf(" nonblock - use non-blocking I/O\n"); + printf(" sync - use synchronized I/O for data and metadata\n"); +} + +static int +parse_args(int argc, char *argv) +{ + switch (argc) { + case DD_OPTION_IF: + g_opts.input_file = strdup(argv); + break; + case DD_OPTION_OF: + g_opts.output_file = strdup(argv); + break; + case DD_OPTION_IFLAGS: + g_opts.input_file_flags = strdup(argv); + break; + case DD_OPTION_OFLAGS: + g_opts.output_file_flags = strdup(argv); + break; + case DD_OPTION_IB: + g_opts.input_bdev = strdup(argv); + break; + case DD_OPTION_OB: + g_opts.output_bdev = strdup(argv); + break; + case DD_OPTION_SKIP: + g_opts.input_offset = spdk_strtol(optarg, 10); + break; + case DD_OPTION_SEEK: + g_opts.output_offset = spdk_strtol(optarg, 10); + break; + case DD_OPTION_BS: + g_opts.io_unit_size = spdk_strtol(optarg, 10); + break; + case DD_OPTION_QD: + g_opts.queue_depth = spdk_strtol(optarg, 10); + break; + case DD_OPTION_COUNT: + g_opts.io_unit_count = spdk_strtol(optarg, 10); + break; + case DD_OPTION_AIO: + g_opts.aio = true; + break; + default: + usage(); + return 1; + } + return 0; +} + +static void +dd_free(void) +{ + uint32_t i; + + free(g_opts.input_file); + free(g_opts.output_file); + free(g_opts.input_bdev); + free(g_opts.output_bdev); + free(g_opts.input_file_flags); + free(g_opts.output_file_flags); + +#ifdef SPDK_CONFIG_URING + if (g_opts.aio == false) { + if (g_job.input.type == DD_TARGET_TYPE_FILE && g_job.input.open == true) { + io_uring_queue_exit(&g_job.input.u.uring.ring); + } + + if 
(g_job.output.type == DD_TARGET_TYPE_FILE && g_job.output.open == true) { + io_uring_queue_exit(&g_job.output.u.uring.ring); + } + } +#endif + + if (g_job.ios) { + for (i = 0; i < g_opts.queue_depth; i++) { + spdk_free(g_job.ios[i].buf); + } + + free(g_job.ios); + } +} + +int +main(int argc, char **argv) +{ + struct spdk_app_opts opts = {}; + int rc = 1; + + spdk_app_opts_init(&opts); + opts.name = "spdk_dd"; + opts.reactor_mask = "0x1"; + opts.shutdown_cb = dd_finish; + rc = spdk_app_parse_args(argc, argv, &opts, "", g_cmdline_opts, parse_args, usage); + if (rc == SPDK_APP_PARSE_ARGS_FAIL) { + SPDK_ERRLOG("Invalid arguments\n"); + goto end; + } else if (rc == SPDK_APP_PARSE_ARGS_HELP) { + goto end; + } + + if (g_opts.input_file != NULL && g_opts.input_bdev != NULL) { + SPDK_ERRLOG("You may specify either --if or --ib, but not both.\n"); + rc = EINVAL; + goto end; + } + + if (g_opts.output_file != NULL && g_opts.output_bdev != NULL) { + SPDK_ERRLOG("You may specify either --of or --ob, but not both.\n"); + rc = EINVAL; + goto end; + } + + if (g_opts.input_file == NULL && g_opts.input_bdev == NULL) { + SPDK_ERRLOG("You must specify either --if or --ib\n"); + rc = EINVAL; + goto end; + } + + if (g_opts.output_file == NULL && g_opts.output_bdev == NULL) { + SPDK_ERRLOG("You must specify either --of or --ob\n"); + rc = EINVAL; + goto end; + } + + if (g_opts.io_unit_size <= 0) { + SPDK_ERRLOG("Invalid --bs value\n"); + rc = EINVAL; + goto end; + } + + if (g_opts.io_unit_count < 0) { + SPDK_ERRLOG("Invalid --count value\n"); + rc = EINVAL; + goto end; + } + + if (g_opts.output_file == NULL && g_opts.output_file_flags != NULL) { + SPDK_ERRLOG("--oflags may be used only with --of\n"); + rc = EINVAL; + goto end; + } + + if (g_opts.input_file == NULL && g_opts.input_file_flags != NULL) { + SPDK_ERRLOG("--iflags may be used only with --if\n"); + rc = EINVAL; + goto end; + } + + rc = spdk_app_start(&opts, dd_run, NULL); + if (rc) { + SPDK_ERRLOG("Error occured while performing 
copy\n"); + } + + dd_free(); + spdk_app_fini(); + +end: + return rc; +} diff --git a/app/spdk_tgt/Makefile b/app/spdk_tgt/Makefile index 4f6eef01346..43583f85577 100644 --- a/app/spdk_tgt/Makefile +++ b/app/spdk_tgt/Makefile @@ -50,7 +50,7 @@ endif endif endif -SPDK_LIB_LIST += event_bdev event_accel event_iscsi event_net event_scsi event_nvmf event_vmd event +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) event_iscsi event_net event_scsi event_nvmf event SPDK_LIB_LIST += nvmf trace log conf thread util bdev iscsi scsi accel rpc jsonrpc json SPDK_LIB_LIST += app_rpc log_rpc bdev_rpc net sock notify diff --git a/app/spdk_top/spdk_top.c b/app/spdk_top/spdk_top.c index 0b7c9269aaf..8531cd32a66 100644 --- a/app/spdk_top/spdk_top.c +++ b/app/spdk_top/spdk_top.c @@ -1083,6 +1083,8 @@ refresh_pollers_tab(uint8_t current_page) g_last_page = current_page; } + max_pages = (count + g_max_data_rows - 1) / g_max_data_rows; + /* Clear screen if number of pollers changed */ if (g_last_pollers_count != count) { for (i = TABS_DATA_START_ROW; i < g_data_win_size; i++) { @@ -1092,9 +1094,13 @@ refresh_pollers_tab(uint8_t current_page) } g_last_pollers_count = count; - } - max_pages = (count + g_max_data_rows - 1) / g_max_data_rows; + /* We need to run store_last_run_counter() again, so the easiest way is to call this function + * again with changed g_last_page value */ + g_last_page = 0xF; + refresh_pollers_tab(current_page); + return max_pages; + } /* Timed pollers can switch their possition on a list because of how they work. 
* Let's sort them by name first so that they won't switch on data refresh */ diff --git a/app/trace/Makefile b/app/trace/Makefile index 92df0857f2d..92fb60cf6e9 100644 --- a/app/trace/Makefile +++ b/app/trace/Makefile @@ -36,6 +36,7 @@ include $(SPDK_ROOT_DIR)/mk/spdk.common.mk include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk APP = spdk_trace +SPDK_NO_LINK_ENV = 1 CXX_SRCS := trace.cpp diff --git a/app/vhost/Makefile b/app/vhost/Makefile index 2106a23d37b..4bb28b20ecb 100644 --- a/app/vhost/Makefile +++ b/app/vhost/Makefile @@ -46,7 +46,7 @@ ifeq ($(CONFIG_VHOST_INTERNAL_LIB),y) SPDK_LIB_LIST += rte_vhost endif -SPDK_LIB_LIST += event_bdev event_accel event_net event_scsi event_vmd event +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) event_net event_scsi event SPDK_LIB_LIST += jsonrpc json rpc bdev_rpc bdev scsi accel trace conf SPDK_LIB_LIST += thread util log log_rpc app_rpc SPDK_LIB_LIST += event_nbd nbd net sock notify diff --git a/autobuild.sh b/autobuild.sh index 3613ed56fe9..5d700e7c704 100755 --- a/autobuild.sh +++ b/autobuild.sh @@ -14,7 +14,7 @@ source "$1" source "$rootdir/test/common/autotest_common.sh" out=$output_dir -scanbuild="scan-build -o $output_dir/scan-build-tmp --status-bugs" +scanbuild="scan-build -o $output_dir/scan-build-tmp --exclude $rootdir/dpdk/ --status-bugs" config_params=$(get_config_params) trap '[[ -d $SPDK_WORKSPACE ]] && rm -rf "$SPDK_WORKSPACE"' 0 @@ -156,6 +156,7 @@ function autobuild_test_suite() { run_test "autobuild_ocf_precompile" ocf_precompile fi run_test "autobuild_check_so_deps" $rootdir/test/make/check_so_deps.sh $1 + ./configure $config_params --without-shared run_test "scanbuild_make" scanbuild_make run_test "autobuild_generated_files_check" porcelain_check run_test "autobuild_header_dependency_check" header_dependency_check @@ -182,5 +183,7 @@ else if [ "$SPDK_TEST_OCF" -eq 1 ]; then run_test "autobuild_ocf_precompile" ocf_precompile fi + # if we aren't testing the unittests, build with shared objects. 
+ ./configure $config_params --with-shared run_test "make" $MAKE $MAKEFLAGS fi diff --git a/autorun.sh b/autorun.sh index 0e5c4c3de83..cdc27a3d50b 100755 --- a/autorun.sh +++ b/autorun.sh @@ -17,5 +17,5 @@ cat "$conf" # Runs agent scripts $rootdir/autobuild.sh "$conf" -sudo -E WITH_DPDK_DIR="$WITH_DPDK_DIR" $rootdir/autotest.sh "$conf" +sudo -E $rootdir/autotest.sh "$conf" $rootdir/autopackage.sh "$conf" diff --git a/autotest.sh b/autotest.sh index b24dbffea28..d14e65ddc2c 100755 --- a/autotest.sh +++ b/autotest.sh @@ -13,12 +13,18 @@ source "$1" source "$rootdir/test/common/autotest_common.sh" source "$rootdir/test/nvmf/common.sh" +# always test with SPDK shared objects. +export SPDK_LIB_DIR="$rootdir/build/lib" +export DPDK_LIB_DIR="$rootdir/dpdk/build/lib" +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$SPDK_LIB_DIR:$DPDK_LIB_DIR + if [ $EUID -ne 0 ]; then echo "$0 must be run as root" exit 1 fi if [ $(uname -s) = Linux ]; then + old_core_pattern=$(< /proc/sys/kernel/core_pattern) # set core_pattern to a known value to avoid ABRT, systemd-coredump, etc. echo "core" > /proc/sys/kernel/core_pattern @@ -30,6 +36,11 @@ if [ $(uname -s) = Linux ]; then # make sure nbd (network block device) driver is loaded if it is available # this ensures that when tests need to use nbd, it will be fully initialized modprobe nbd || true + + if udevadm=$(type -P udevadm); then + "$udevadm" monitor --property &> "$output_dir/udev.log" & + udevadm_pid=$! + fi fi trap "process_core; autotest_cleanup; exit 1" SIGINT SIGTERM EXIT @@ -72,9 +83,6 @@ rm -f /var/tmp/spdk*.sock # Load the kernel driver ./scripts/setup.sh reset -# Let the kernel discover any filesystems or partitions -sleep 10 - if [ $(uname -s) = Linux ]; then # OCSSD devices drivers don't support IO issues by kernel so # detect OCSSD devices and blacklist them (unbind from any driver). @@ -143,8 +151,7 @@ fi # Revert existing OPAL to factory settings that may have been left from earlier failed tests. 
# This ensures we won't hit any unexpected failures due to NVMe SSDs being locked. -# Disable this for now as we don't have opal test running -# opal_revert_cleanup +opal_revert_cleanup ##################### # Unit Tests @@ -152,12 +159,12 @@ fi if [ $SPDK_TEST_UNITTEST -eq 1 ]; then run_test "unittest" ./test/unit/unittest.sh + run_test "env" test/env/env.sh fi if [ $SPDK_RUN_FUNCTIONAL_TEST -eq 1 ]; then timing_enter lib - run_test "env" test/env/env.sh run_test "rpc" test/rpc/rpc.sh run_test "rpc_client" test/rpc_client/rpc_client.sh run_test "json_config" ./test/json_config/json_config.sh @@ -169,6 +176,10 @@ if [ $SPDK_RUN_FUNCTIONAL_TEST -eq 1 ]; then if [ $SPDK_TEST_BLOCKDEV -eq 1 ]; then run_test "blockdev_general" test/bdev/blockdev.sh run_test "bdev_raid" test/bdev/bdev_raid.sh + run_test "bdevperf_config" test/bdev/bdevperf/test_config.sh + if [[ $(uname -s) == Linux ]]; then + run_test "spdk_dd" test/dd/dd.sh + fi fi if [ $SPDK_TEST_JSON -eq 1 ]; then @@ -201,23 +212,19 @@ if [ $SPDK_RUN_FUNCTIONAL_TEST -eq 1 ]; then timing_exit lib if [ $SPDK_TEST_ISCSI -eq 1 ]; then - run_test "iscsi_tgt_posix" ./test/iscsi_tgt/iscsi_tgt.sh posix + run_test "iscsi_tgt" ./test/iscsi_tgt/iscsi_tgt.sh run_test "spdkcli_iscsi" ./test/spdkcli/iscsi.sh # Run raid spdkcli test under iSCSI since blockdev tests run on systems that can't run spdkcli yet run_test "spdkcli_raid" test/spdkcli/raid.sh fi - if [ $SPDK_TEST_VPP -eq 1 ]; then - run_test "iscsi_tgt_vpp" ./test/iscsi_tgt/iscsi_tgt.sh vpp - fi - if [ $SPDK_TEST_BLOBFS -eq 1 ]; then run_test "rocksdb" ./test/blobfs/rocksdb/rocksdb.sh run_test "blobstore" ./test/blobstore/blobstore.sh run_test "blobfs" ./test/blobfs/blobfs.sh run_test "hello_blob" $SPDK_EXAMPLE_DIR/hello_blob \ - examples/blob/hello_world/hello_blob.conf + examples/blob/hello_world/hello_blob.json fi if [ $SPDK_TEST_NVMF -eq 1 ]; then diff --git a/configure b/configure index 4c9772adf45..7876ea7056f 100755 --- a/configure +++ b/configure @@ -68,8 +68,6 @@ 
function usage() echo " example: /usr/share/pmdk" echo " reduce Build vbdev compression module." echo " No path required." - echo " vpp Build VPP net module." - echo " example: /vpp_repo/build-root/rpmbuild/vpp-18.01.1.0/build-root/install-vpp-native/vpp" echo " rbd Build Ceph RBD bdev module." echo " No path required." echo " rdma Build RDMA transport for NVMf target and initiator." @@ -363,17 +361,6 @@ for i in "$@"; do --without-reduce) CONFIG[REDUCE]=n ;; - --with-vpp) - CONFIG[VPP]=y - ;; - --with-vpp=*) - CONFIG[VPP]=y - check_dir "$i" - CONFIG[VPP_DIR]=$(readlink -f ${i#*=}) - ;; - --without-vpp) - CONFIG[VPP]=n - ;; --with-fio) ;& --with-fio=*) if [[ ${i#*=} != "$i" ]]; then @@ -706,18 +693,6 @@ if [[ "${CONFIG[REDUCE]}" = "y" ]]; then fi fi -if [[ "${CONFIG[VPP]}" = "y" ]]; then - if [ ! -z "${CONFIG[VPP_DIR]}" ]; then - VPP_CFLAGS="-L${CONFIG[VPP_DIR]}/lib -I${CONFIG[VPP_DIR]}/include" - fi - if ! echo -e '#include \nint main(void) { return 0; }\n' \ - | ${BUILD_CMD[@]} ${VPP_CFLAGS} -lvppinfra -lsvm -lvlibmemoryclient - 2>/dev/null; then - echo --with-vpp requires installed vpp. - echo Please install then re-run this script. - exit 1 - fi -fi - if [[ "${CONFIG[NVME_CUSE]}" = "y" ]]; then if ! echo -e '#define FUSE_USE_VERSION 31\n#include \n#include \n#include \nint main(void) { return 0; }\n' \ | ${BUILD_CMD[@]} -lfuse3 -D_FILE_OFFSET_BITS=64 - 2>/dev/null; then @@ -766,6 +741,8 @@ if [[ "${CONFIG[UBSAN]}" = "y" ]]; then | ${BUILD_CMD[@]} -fsanitize=undefined - 2>/dev/null; then echo --enable-ubsan requires libubsan. echo Please install then re-run this script. + echo If installed, please check that the GCC version is at least 6.4 \ + and synchronize CC accordingly. exit 1 fi fi diff --git a/doc/Doxyfile b/doc/Doxyfile index ccaaf8988b9..24dba406336 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -800,8 +800,10 @@ INPUT += \ # All remaining pages are listed here in alphabetical order by filename. 
INPUT += \ about.md \ + accel_fw.md \ applications.md \ bdev.md \ + bdevperf.md \ bdev_module.md \ bdev_pg.md \ blob.md \ @@ -833,6 +835,7 @@ INPUT += \ peer_2_peer.md \ porting.md \ spdkcli.md \ + spdk_top.md \ ssd_internals.md \ system_configuration.md \ userspace.md \ @@ -840,8 +843,7 @@ INPUT += \ vhost.md \ vhost_processing.md \ virtio.md \ - vmd.md \ - vpp_integration.md + vmd.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/accel_fw.md b/doc/accel_fw.md new file mode 100644 index 00000000000..ccedc3e8424 --- /dev/null +++ b/doc/accel_fw.md @@ -0,0 +1,107 @@ +# Acceleration Framework {#accel_fw} + +SPDK provides a framework for abstracting general acceleration capabilities +that can be implemented through plug-in modules and low-level libraries. These +plug-in modules include support for hardware acceleration engines such as +the Intel(R) I/O Acceleration Technology (IOAT) engine and the Intel(R) Data +Streaming Accelerator (DSA) engine. Additionally, a software plug-in module +exists to enable use of the framework in environments without hardware +acceleration capabilities. ISA/L is used for optimized CRC32C calculation within +the software module. + +The framework includes an API for getting the current capabilities of the +selected module. See [`spdk_accel_get_capabilities`](https://spdk.io/doc/accel__engine_8h.html) for more details. For the software module, all capabilities will be reported as supported. For the hardware modules, only functions accelerated by hardware will be reported however any function can still be called, it will just be backed by software if it is not reported as a supported capability. 
+ +# Acceleration Framework Functions {#accel_functions} + +Functions implemented via the framework can be found in the Doxygen documentation of the +framework public header file here [accel_engine.h](https://spdk.io/doc/accel__engine_8h.html) + +# Acceleration Framework Design Considerations {#accel_dc} + +The general interface is defined by `/include/accel_engine.h` and implemented +in `/lib/accel`. These functions may be called by an SPDK application and in +most cases, except where otherwise documented, are asynchronous and follow the +standard SPDK model for callbacks with a callback argument. + +If the acceleration framework is started without initializing a hardware module, +optimized software implementations of the functions will back the public API. +Additionally, if any hardware module does not support a specific function and that +hardware module is initialized, the specific function will fall back to a software +optimized implementation. For example, IOAT does not support the dualcast function +in hardware but if the IOAT module has been initialized and the public dualcast API +is called, it will actually be done via software behind the scenes. + +# Acceleration Low Level Libraries {#accel_libs} + +Low level libraries provide only the most basic functions that are specific to +the hardware. Low level libraries are located in the '/lib' directory with the +exception of the software implementation which is implemented as part of the +framework itself. The software low level library does not expose a public API. +Applications may choose to interact directly with a low level library if there are +specific needs/considerations not met via accessing the library through the +framework/module. Note that when using the low level libraries directly, the +framework abstracted interface is bypassed as the application will call the public +functions exposed by the individual low level libraries. 
Thus, code written this +way needs to be certain that the underlying hardware exists everywhere that it runs. + +The low level library for IOAT is located in `/lib/ioat`. The low level library +for DSA is in `/lib/idxd` (IDXD stands for Intel(R) Data Acceleration Driver). + +# Acceleration Plug-In Modules {#accel_modules} + +Plug-in modules depend on low level libraries to interact with the hardware and +add additional functionality such as queueing during busy conditions or flow +control in some cases. The framework in turn depends on the modules to provide +the complete implementation of the acceleration component. A module must be +selected via startup RPC when the application is started. Otherwise, if no startup +RPC is provided, the framework is available and will use the software plug-in module. + +## IOAT Module {#accel_ioat} + +To use the IOAT engine, use the RPC [`ioat_scan_accel_engine`](https://spdk.io/doc/jsonrpc.html) before starting the application. + +## IDXD Module {#accel_idxd} + +To use the DSA engine, use the RPC [`idxd_scan_accel_engine`](https://spdk.io/doc/jsonrpc.html) with an optional parameter of `-c` and provide a configuration number of either 0 or 1. These pre-defined configurations determine how the DSA engine will be set up in terms +of work queues and engines. The DSA engine is very flexible allowing for various configurations of these elements to either account for different quality of service requirements or to isolate hardware paths where the back end media is of varying latency (i.e. persistent memory vs DRAM). The pre-defined configurations are as follows: + +0: Four separate work queues each backed with one DSA engine. This is a generic +configuration that provides 4 portals to submit operations to each with a +single engine behind it providing some level of isolation as operations are +submitted round-robin. + +1: Two separate work queues each backed with two DSA engines. 
This is another +generic configuration that provides 2 portals to submit operations to and +lets the DSA hardware decide which engine to select based on loading. + +There are several other configurations that are possible that include quality +of service parameters on the work queues that are not currently utilized by +the module. Specialized use of DSA may require different configurations that +can be added to the module as needed. + +## Software Module {#accel_sw} + +The software module is enabled by default. If no hardware engine is explicitly +enabled via startup RPC as discussed earlier, the software module will use ISA-L +if available for functions such as CRC32C. Otherwise, standard glibc calls are +used to back the framework API. + +## Batching {#batching} + +Batching is exposed by the acceleration framework and provides an interface to +batch sets of commands up and then submit them with a single command. The public +API is consistent with the implementation however each plug-in module behaves +differently depending on its capabilities. + +The DSA engine has complete support for batching all supported commands together +into one submission. This is advantageous as it reduces the overhead incurred in +the submission process to the hardware. + +The software engine supports batching only to be consistent with the framework API. +In software there is no savings by batching sets of commands versus submitting them +individually. + +The IOAT engine supports batching but it is only beneficial for `memmove` and `memfill` +as these are supported by the hardware. All other commands can be batched and the +framework will manage all other commands via software. 
diff --git a/doc/bdev.md b/doc/bdev.md index 9f842943f93..1ab68e08c4c 100644 --- a/doc/bdev.md +++ b/doc/bdev.md @@ -1,5 +1,9 @@ # Block Device User Guide {#bdev} +# Target Audience {#bdev_ug_targetaudience} + +This user guide is intended for software developers who have knowledge of block storage, storage drivers, issuing JSON-RPC commands and storage services such as RAID, compression, crypto, and others. + # Introduction {#bdev_ug_introduction} The SPDK block device layer, often simply called *bdev*, is a C library @@ -35,72 +39,12 @@ directly from SPDK application by running `scripts/rpc.py rpc_get_methods`. Detailed help for each command can be displayed by adding `-h` flag as a command parameter. -# General Purpose RPCs {#bdev_ug_general_rpcs} - -## bdev_get_bdevs {#bdev_ug_get_bdevs} - -List of currently available block devices including detailed information about -them can be get by using `bdev_get_bdevs` RPC command. User can add optional -parameter `name` to get details about specified by that name bdev. - -Example response - -~~~ -{ - "num_blocks": 32768, - "assigned_rate_limits": { - "rw_ios_per_sec": 10000, - "rw_mbytes_per_sec": 20 - }, - "supported_io_types": { - "reset": true, - "nvme_admin": false, - "unmap": true, - "read": true, - "write_zeroes": true, - "write": true, - "flush": true, - "nvme_io": false - }, - "driver_specific": {}, - "claimed": false, - "block_size": 4096, - "product_name": "Malloc disk", - "name": "Malloc0" -} -~~~ - -## bdev_set_qos_limit {#bdev_set_qos_limit} - -Users can use the `bdev_set_qos_limit` RPC command to enable, adjust, and disable -rate limits on an existing bdev. Two types of rate limits are supported: -IOPS and bandwidth. The rate limits can be enabled, adjusted, and disabled at any -time for the specified bdev. The bdev name is a required parameter for this -RPC command and at least one of `rw_ios_per_sec` and `rw_mbytes_per_sec` must be -specified. 
When both rate limits are enabled, the first met limit will -take effect. The value 0 may be specified to disable the corresponding rate -limit. Users can run this command with `-h` or `--help` for more information. - -## Histograms {#rpc_bdev_histogram} - -The `bdev_enable_histogram` RPC command allows to enable or disable gathering -latency data for specified bdev. Histogram can be downloaded by the user by -calling `bdev_get_histogram` and parsed using scripts/histogram.py script. - -Example command - -`rpc.py bdev_enable_histogram Nvme0n1 --enable` - -The command will enable gathering data for histogram on Nvme0n1 device. - -`rpc.py bdev_get_histogram Nvme0n1 | histogram.py` +# Configuring Block Device Modules {#bdev_ug_general_rpcs} -The command will download gathered histogram data. The script will parse -the data and show table containing IO count for latency ranges. +Block devices can be configured using JSON RPCs. A complete list of available RPC commands +with detailed information can be found on the @ref jsonrpc_components_bdev page. -`rpc.py bdev_enable_histogram Nvme0n1 --disable` - -The command will disable histogram on Nvme0n1 device. +# Common Block Device Configuration Examples # Ceph RBD {#bdev_config_rbd} @@ -378,6 +322,14 @@ please visit [OCF documentation](https://open-cas.github.io/). Malloc bdevs are ramdisks. Because of its nature they are volatile. They are created from hugepage memory given to SPDK application. +Example command for creating malloc bdev: + +`rpc.py bdev_malloc_create -b Malloc0 64 512` + +Example command for removing malloc bdev: + +`rpc.py bdev_malloc_delete Malloc0` + # Null {#bdev_config_null} The SPDK null bdev driver is a dummy block I/O target that discards all writes and returns undefined @@ -460,7 +412,6 @@ User can get list of available lvol stores using `bdev_lvol_get_lvstores` RPC co parameters available). 
Example response - ~~~ { "uuid": "330a6ab2-f468-11e7-983e-001e67edf35d", @@ -492,26 +443,6 @@ Example commands `rpc.py bdev_lvol_create lvol2 25 -u 330a6ab2-f468-11e7-983e-001e67edf35d` -# RAID {#bdev_ug_raid} - -RAID virtual bdev module provides functionality to combine any SPDK bdevs into -one RAID bdev. Currently SPDK supports only RAID 0. RAID functionality does not -store on-disk metadata on the member disks, so user must recreate the RAID -volume when restarting application. User may specify member disks to create RAID -volume event if they do not exists yet - as the member disks are registered at -a later time, the RAID module will claim them and will surface the RAID volume -after all of the member disks are available. It is allowed to use disks of -different sizes - the smallest disk size will be the amount of space used on -each member disk. - -Example commands - -`rpc.py bdev_raid_create -n Raid0 -z 64 -r 0 -b "lvol0 lvol1 lvol2 lvol3"` - -`rpc.py bdev_raid_get_bdevs` - -`rpc.py bdev_raid_delete Raid0` - # Passthru {#bdev_config_passthru} The SPDK Passthru virtual block device module serves as an example of how to write a @@ -561,6 +492,65 @@ To remove a block device representation use the bdev_pmem_delete command. `rpc.py bdev_pmem_delete pmem` +# RAID {#bdev_ug_raid} + +RAID virtual bdev module provides functionality to combine any SPDK bdevs into +one RAID bdev. Currently SPDK supports only RAID 0. RAID functionality does not +store on-disk metadata on the member disks, so the user must recreate the RAID +volume when restarting the application. The user may specify member disks to create a RAID +volume even if they do not exist yet - as the member disks are registered at +a later time, the RAID module will claim them and will surface the RAID volume +after all of the member disks are available. It is allowed to use disks of +different sizes - the smallest disk size will be the amount of space used on +each member disk.
+ +Example commands + +`rpc.py bdev_raid_create -n Raid0 -z 64 -r 0 -b "lvol0 lvol1 lvol2 lvol3"` + +`rpc.py bdev_raid_get_bdevs` + +`rpc.py bdev_raid_delete Raid0` + +# Split {#bdev_ug_split} + +The split block device module takes an underlying block device and splits it into +several smaller equal-sized virtual block devices. This serves as an example to create +more vbdevs on a given base bdev for user testing. + +Example commands + +To create four split bdevs with base bdev_b0 use the `bdev_split_create` command. +Each split bdev will be one fourth the size of the base bdev. + +`rpc.py bdev_split_create bdev_b0 4` + +The `split_size_mb`(-s) parameter restricts the size of each split bdev. +The total size of all split bdevs must not exceed the base bdev size. + +`rpc.py bdev_split_create bdev_b0 4 -s 128` + +To remove the split bdevs, use the `bdev_split_delete` command with the base bdev name. + +`rpc.py bdev_split_delete bdev_b0` + +# Uring {#bdev_ug_uring} + +The uring bdev module issues I/O to kernel block devices using the io_uring Linux kernel API. This module requires liburing. +For more information on io_uring refer to kernel [IO_uring] (https://kernel.dk/io_uring.pdf) + +The user needs to configure SPDK to include io_uring support: + +`configure --with-uring` + +To create a uring bdev with given filename, bdev name and block size use the `bdev_uring_create` RPC. + +`rpc.py bdev_uring_create /path/to/device bdev_u0 512` + +To remove a uring bdev use the `bdev_uring_delete` RPC. + +`rpc.py bdev_uring_delete bdev_u0` + # Virtio Block {#bdev_config_virtio_blk} The Virtio-Block driver allows creating SPDK bdevs from Virtio-Block devices. diff --git a/doc/bdev_module.md b/doc/bdev_module.md index f6c5392cc00..191a1cf535f 100644 --- a/doc/bdev_module.md +++ b/doc/bdev_module.md @@ -18,7 +18,7 @@ how to write a module. ## Creating A New Module -Block device modules are located in subdirectories under lib/bdev today. 
It is not +Block device modules are located in subdirectories under module/bdev today. It is not currently possible to place the code for a bdev module elsewhere, but updates to the build system could be made to enable this in the future. To create a module, add a new directory with a single C file and a Makefile. A great @@ -137,6 +137,15 @@ block device. Once the I/O request is completed, the module must call spdk_bdev_io_complete(). The I/O does not have to finish within the calling context of `submit_request`. +Integrating a new bdev module into the build system requires updates to various +files in the /mk directory. + +## Creating Bdevs in an External Repository + +A User can build their own bdev module and application on top of existing SPDK libraries. The example in +test/external_code serves as a template for creating, building and linking an external +bdev module. Refer to test/external_code/README.md and @ref so_linking for further information. + ## Creating Virtual Bdevs Block devices are considered virtual if they handle I/O requests by routing diff --git a/doc/bdevperf.md b/doc/bdevperf.md new file mode 100644 index 00000000000..8c5c5828c20 --- /dev/null +++ b/doc/bdevperf.md @@ -0,0 +1,86 @@ +# Using bdevperf application {#bdevperf} + +## Introduction + +bdevperf is an SPDK application that is used for performance testing +of block devices (bdevs) exposed by the SPDK bdev layer. It is an +alternative to the SPDK bdev fio plugin for benchmarking SPDK bdevs. +In some cases, bdevperf can provide much lower overhead than the fio +plugin, resulting in much better performance for tests using a limited +number of CPU cores. + +bdevperf exposes command line interface that allows to specify +SPDK framework options as well as testing options. +Since SPDK 20.07, bdevperf supports configuration file that is similar +to FIO. It allows user to create jobs parameterized by +filename, cpumask, blocksize, queuesize, etc. 
+ +## Config file + +Bdevperf's config file is similar to FIO's config file format. + +Below is an example config file that uses all available parameters: + +~~~{.ini} +[global] +filename=Malloc0:Malloc1 +bs=1024 +iosize=256 +rw=randrw +rwmixread=90 + +[A] +cpumask=0xff + +[B] +cpumask=[0-128] +filename=Malloc1 + +[global] +filename=Malloc0 +rw=write + +[C] +bs=4096 +iosize=128 +offset=1000000 +length=1000000 +~~~ + +Jobs `[A]` `[B]` or `[C]`, inherit default values from `[global]` +section residing above them. So in the example, job `[A]` inherits +`filename` value and uses both `Malloc0` and `Malloc1` bdevs as targets, +job `[B]` overrides its `filename` value and uses `Malloc1` and +job `[C]` inherits value `Malloc0` for its `filename`. + +Interaction with CLI arguments is not the same as in FIO however. +If bdevperf receives CLI argument, it overrides values +of corresponding parameter for all `[global]` sections of config file. +So if example config is used, specifying `-q` argument +will make jobs `[A]` and `[B]` use its value. + +Below is a full list of supported parameters with descriptions. + +Param | Default | Description +--------- | ----------------- | ----------- +filename | | Bdevs to use, separated by ":" +cpumask | Maximum available | CPU mask. 
Format is defined at @ref cpu_mask +bs | | Block size (io size) +iodepth | | Queue depth +rwmixread | `50` | Percentage of a mixed workload that should be reads +offset | `0` | Start I/O at the provided offset on the bdev +length | 100% of bdev size | End I/O at `offset`+`length` on the bdev +rw | | Type of I/O pattern + +Available rw types: +- read +- randread +- write +- randwrite +- verify +- reset +- unmap +- write_zeroes +- flush +- rw +- randrw diff --git a/doc/concurrency.md b/doc/concurrency.md index b0ae7021d60..47009e85d8f 100644 --- a/doc/concurrency.md +++ b/doc/concurrency.md @@ -117,14 +117,10 @@ framework for all of the example applications it shipped with, in the interest of supporting the widest variety of frameworks possible. But the applications do of course require something that implements an asynchronous event loop in order to run, so enter the `event` framework located in `lib/event`. This framework -includes things like spawning one thread per core, pinning each thread to a -unique core, polling and scheduling the lightweight threads, installing signal -handlers to cleanly shutdown, and basic command line option parsing. When -started through spdk_app_start(), the library automatically spawns all of the -threads requested, pins them, and is ready for lightweight threads to be -created. This makes it much easier to implement a brand new SPDK application and -is the recommended method for those starting out. Only established applications -should consider directly integrating the lower level libraries. +includes things like polling and scheduling the lightweight threads, installing +signal handlers to cleanly shutdown, and basic command line option parsing. +Only established applications should consider directly integrating the lower +level libraries. 
# Limitations of the C Language diff --git a/doc/general.md b/doc/general.md index 4b61540a64e..0710e53545e 100644 --- a/doc/general.md +++ b/doc/general.md @@ -2,4 +2,4 @@ - @subpage event - @subpage logical_volumes -- @subpage vpp_integration +- @subpage accel_fw diff --git a/doc/idxd.md b/doc/idxd.md index 0f9f69329df..8c33770ef3a 100644 --- a/doc/idxd.md +++ b/doc/idxd.md @@ -9,6 +9,10 @@ Function | Description --------------------------------------- | ----------- spdk_idxd_probe() | @copybrief spdk_idxd_probe() +spdk_idxd_batch_get_max() | @copybrief spdk_idxd_batch_get_max() +spdk_idxd_batch_create() | @copybrief spdk_idxd_batch_create() +spdk_idxd_batch_prep_copy() | @copybrief spdk_idxd_batch_prep_copy() +spdk_idxd_batch_submit() | @copybrief spdk_idxd_batch_submit() spdk_idxd_submit_copy() | @copybrief spdk_idxd_submit_copy() spdk_idxd_submit_compare() | @copybrief spdk_idxd_submit_compare() spdk_idxd_submit_crc32c() | @copybrief spdk_idxd_submit_crc32c() diff --git a/doc/img/spdk_top_page1_threads.png b/doc/img/spdk_top_page1_threads.png new file mode 100644 index 00000000000..0dba67035b0 Binary files /dev/null and b/doc/img/spdk_top_page1_threads.png differ diff --git a/doc/img/spdk_top_page2_pollers.png b/doc/img/spdk_top_page2_pollers.png new file mode 100644 index 00000000000..f85bfb7f343 Binary files /dev/null and b/doc/img/spdk_top_page2_pollers.png differ diff --git a/doc/img/spdk_top_page3_cores.png b/doc/img/spdk_top_page3_cores.png new file mode 100644 index 00000000000..01f8e3d050a Binary files /dev/null and b/doc/img/spdk_top_page3_cores.png differ diff --git a/doc/iscsi.md b/doc/iscsi.md index 7d81623d44c..7ba30cd9d83 100644 --- a/doc/iscsi.md +++ b/doc/iscsi.md @@ -324,11 +324,39 @@ At the iSCSI level, we provide the following support for Hotplug: wait for all the commands which have already been submitted to block device to return back; after all the commands return back, the LUN will be deleted. 
-## Known bugs and limitations {#iscsi_hotplug_bugs} +@sa spdk_nvme_probe -For write command, if you want to test hotplug with write command which will -cause r2t, for example 1M size IO, it will crash the iscsi tgt. -For read command, if you want to test hotplug with large read IO, for example 1M -size IO, it will probably crash the iscsi tgt. +# iSCSI Login Redirection {#iscsi_login_redirection} -@sa spdk_nvme_probe +The SPDK iSCSI target application supports the iSCSI login redirection feature. + +A portal refers to an IP address and TCP port number pair, and a portal group +contains a set of portals. Users of the SPDK iSCSI target application configure +portals through portal groups. + +To support the login redirection feature, we utilize two types of portal groups, +public portal group and private portal group. + +The SPDK iSCSI target application usually has a discovery portal. The discovery +portal is connected by an initiator to get a list of targets, as well as the list +of portals on which these targets may be accessed, by a discovery session. + +Public portal groups have their portals returned by a discovery session. Private +portal groups do not have their portals returned by a discovery session. A public +portal group may optionally have a redirect portal for non-discovery logins for +each associated target. This redirect portal must be from a private portal group. + +Initiators configure portals in public portal groups as target portals. When an +initiator logs in to a target through a portal in an associated public portal group, +the target sends a temporary redirection response with a redirect portal. Then the +initiator logs in to the target again through the redirect portal.
+ +Users set a portal group to public or private at creation using the +`iscsi_create_portal_group` RPC, associate portal groups with a target using the +`iscsi_create_target_node` RPC or the `iscsi_target_node_add_pg_ig_maps` RPC, +specify an up-to-date redirect portal in a public portal group for a target using +the `iscsi_target_node_set_redirect` RPC, and terminate the corresponding connections +by asynchronous logout request using the `iscsi_target_node_request_logout` RPC. + +Typically users will use the login redirection feature in a scale-out iSCSI target +system, which runs multiple SPDK iSCSI target applications. diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index 97ec7312723..5b69a680fbd 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -708,6 +708,41 @@ Example response: } ~~~ +## log_enable_timestamps {#rpc_log_enable_timestamps} + +Enable or disable timestamps. + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +enabled | Required | boolean | on or off + +### Example + +Example request: + +~~~ +{ + "jsonrpc": "2.0", + "method": "log_enable_timestamps", + "id": 1, + "params": { + "enabled": true + } +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ + ## thread_get_pollers {#rpc_thread_get_pollers} Retrieve current pollers of all the threads. @@ -1096,6 +1131,145 @@ Example response: } ~~~ +## bdev_compress_create {#rpc_bdev_compress_create} + +Create a new compress bdev on a given base bdev. + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +base_bdev_name | Required | string | Name of the base bdev +pm_path | Required | string | Path to persistent memory +lb_size | Optional | int | Compressed vol logical block size (512 or 4096) + +### Result + +Name of newly created bdev.
+ +### Example + +Example request: + +~~~ +{ + "params": { + "base_bdev_name": "Nvme0n1", + "pm_path": "/pm_files", + "lb_size": 4096 + }, + "jsonrpc": "2.0", + "method": "bdev_compress_create", + "id": 1 +} +~~~ + +## bdev_compress_delete {#rpc_bdev_compress_delete} + +Delete a compressed bdev. + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +name | Required | string | Name of the compress bdev + +### Example + +Example request: + +~~~ +{ + "params": { + "name": "COMP_Nvme0n1" + }, + "jsonrpc": "2.0", + "method": "bdev_compress_delete", + "id": 1 +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ + +## bdev_compress_get_orphans {#rpc_bdev_compress_get_orphans} + +Get a list of compressed volumes that are missing their pmem metadata. + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +name | Required | string | Name of the compress bdev + +### Example + +Example request: + +~~~ +{ + "params": { + "name": "COMP_Nvme0n1" + }, + "jsonrpc": "2.0", + "method": "bdev_compress_get_orphans", + "id": 1 +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "name": "COMP_Nvme0n1" +} +~~~ + +## bdev_compress_set_pmd {#rpc_bdev_compress_set_pmd} + +Select the DPDK polled mode driver (pmd) for a compressed bdev, +0 = auto-select, 1= QAT only, 2 = ISAL only. + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +pmd | Required | int | pmd selection + +### Example + +Example request: + +~~~ +{ + "params": { + "pmd": 1 + }, + "jsonrpc": "2.0", + "method": "bdev_compress_set_pmd", + "id": 1 +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ + ## bdev_ocf_create {#rpc_bdev_ocf_create} Construct new OCF bdev. 
@@ -1523,9 +1697,9 @@ name | Optional | string | Bdev name to use block_size | Required | number | Block size in bytes num_blocks | Required | number | Number of blocks uuid | Optional | string | UUID of new bdev -md_size | Optional | number | Metadata size in bytes -dif_type | Optional | number | Protection information type (0, 1, 2 or 3). Default: 0 - no protection. -dif_is_head_of_md | Optional | boolean | Protection information is in the first 8 bytes of MD. Default: in the last 8 bytes. +md_size | Optional | number | Metadata size for this bdev. Default=0. +dif_type | Optional | number | Protection information type. Parameter --md-size needs to be set along --dif-type. Default=0 - no protection. +dif_is_head_of_md | Optional | boolean | Protection information is in the first 8 bytes of metadata. Default=false. ### Result @@ -1771,7 +1945,10 @@ Example response: ## bdev_nvme_attach_controller {#rpc_bdev_nvme_attach_controller} -Construct @ref bdev_config_nvme +Construct @ref bdev_config_nvme. This RPC can also be used to add additional paths to an existing controller to enable +multipathing. This is done by specifying the `name` parameter as an existing controller. When adding an additional +path, the hostnqn, hostsvcid, hostaddr, prchk_reftag, and prchk_guard_arguments must not be specified and are assumed +to have the same value as the existing path. ### Result @@ -1874,13 +2051,25 @@ Example response: ## bdev_nvme_detach_controller {#rpc_bdev_nvme_detach_controller} -Detach NVMe controller and delete any associated bdevs. +Detach NVMe controller and delete any associated bdevs. Optionally, +If all of the transport ID options are specified, only remove that +transport path from the specified controller. If that is the only +available path for the controller, this will also result in the +controller being detached and the associated bdevs being deleted. 
+ +returns true if the controller and bdevs were successfully destroyed +or the address was properly removed, false otherwise. ### Parameters Name | Optional | Type | Description ----------------------- | -------- | ----------- | ----------- name | Required | string | Controller name +trtype | Optional | string | NVMe-oF target trtype: rdma or tcp +traddr | Optional | string | NVMe-oF target address: ip or BDF +adrfam | Optional | string | NVMe-oF target adrfam: ipv4, ipv6, ib, fc, intra_host +trsvcid | Optional | string | NVMe-oF target trsvcid: port number +subnqn | Optional | string | NVMe-oF target subnqn ### Example @@ -2943,25 +3132,27 @@ This RPC may only be called before SPDK subsystems have been initialized. This R ### Parameters -Name | Optional | Type | Description ---------------------------- | -------- | ------- | ----------- -auth_file | Optional | string | Path to CHAP shared secret file (default: "") -node_base | Optional | string | Prefix of the name of iSCSI target node (default: "iqn.2016-06.io.spdk") -nop_timeout | Optional | number | Timeout in seconds to nop-in request to the initiator (default: 60) -nop_in_interval | Optional | number | Time interval in secs between nop-in requests by the target (default: 30) -disable_chap | Optional | boolean | CHAP for discovery session should be disabled (default: `false`) -require_chap | Optional | boolean | CHAP for discovery session should be required (default: `false`) -mutual_chap | Optional | boolean | CHAP for discovery session should be unidirectional (`false`) or bidirectional (`true`) (default: `false`) -chap_group | Optional | number | CHAP group ID for discovery session (default: 0) -max_sessions | Optional | number | Maximum number of sessions in the host (default: 128) -max_queue_depth | Optional | number | Maximum number of outstanding I/Os per queue (default: 64) -max_connections_per_session | Optional | number | Session specific parameter, MaxConnections (default: 2) -default_time2wait | 
Optional | number | Session specific parameter, DefaultTime2Wait (default: 2) -default_time2retain | Optional | number | Session specific parameter, DefaultTime2Retain (default: 20) -first_burst_length | Optional | number | Session specific parameter, FirstBurstLength (default: 8192) -immediate_data | Optional | boolean | Session specific parameter, ImmediateData (default: `true`) -error_recovery_level | Optional | number | Session specific parameter, ErrorRecoveryLevel (default: 0) -allow_duplicated_isid | Optional | boolean | Allow duplicated initiator session ID (default: `false`) +Name | Optional | Type | Description +------------------------------- | -------- | ------- | ----------- +auth_file | Optional | string | Path to CHAP shared secret file (default: "") +node_base | Optional | string | Prefix of the name of iSCSI target node (default: "iqn.2016-06.io.spdk") +nop_timeout | Optional | number | Timeout in seconds to nop-in request to the initiator (default: 60) +nop_in_interval | Optional | number | Time interval in secs between nop-in requests by the target (default: 30) +disable_chap | Optional | boolean | CHAP for discovery session should be disabled (default: `false`) +require_chap | Optional | boolean | CHAP for discovery session should be required (default: `false`) +mutual_chap | Optional | boolean | CHAP for discovery session should be unidirectional (`false`) or bidirectional (`true`) (default: `false`) +chap_group | Optional | number | CHAP group ID for discovery session (default: 0) +max_sessions | Optional | number | Maximum number of sessions in the host (default: 128) +max_queue_depth | Optional | number | Maximum number of outstanding I/Os per queue (default: 64) +max_connections_per_session | Optional | number | Session specific parameter, MaxConnections (default: 2) +default_time2wait | Optional | number | Session specific parameter, DefaultTime2Wait (default: 2) +default_time2retain | Optional | number | Session specific parameter, 
DefaultTime2Retain (default: 20) +first_burst_length | Optional | number | Session specific parameter, FirstBurstLength (default: 8192) +immediate_data | Optional | boolean | Session specific parameter, ImmediateData (default: `true`) +error_recovery_level | Optional | number | Session specific parameter, ErrorRecoveryLevel (default: 0) +allow_duplicated_isid | Optional | boolean | Allow duplicated initiator session ID (default: `false`) +max_large_datain_per_connection | Optional | number | Max number of outstanding split read I/Os per connection (default: 64) +max_r2t_per_connection | Optional | number | Max number of outstanding R2Ts per connection (default: 4) To load CHAP shared secret file, its path is required to specify explicitly in the parameter `auth_file`. @@ -3046,7 +3237,9 @@ Example response: "auth_file": "/usr/local/etc/spdk/auth.conf", "disable_chap": true, "default_time2wait": 2, - "require_chap": false + "require_chap": false, + "max_large_datain_per_connection": 64, + "max_r2t_per_connection": 4 } } ~~~ @@ -3865,7 +4058,8 @@ Example response: "port": "3260" } ], - "tag": 1 + "tag": 1, + "private": false } ] } @@ -3881,6 +4075,7 @@ Name | Optional | Type | Description --------------------------- | -------- | --------| ----------- tag | Required | number | Portal group tag portals | Required | array | Not empty array of portals +private | Optional | boolean | When true, portals in this group are not returned by a discovery session. Used for login redirection. 
(default: `false`) Portal object @@ -4093,6 +4288,87 @@ Example response: } ~~~ +## iscsi_target_node_set_redirect method {#rpc_iscsi_target_node_set_redirect} + +Update redirect portal of the primary portal group for the target node. + +### Parameters + +Name | Optional | Type | Description +--------------------------- | -------- | --------| ----------- +name | Required | string | Target node name (ASCII) +pg_tag | Required | number | Existing portal group tag +redirect_host | Optional | string | Numeric IP address to which the target node is redirected +redirect_port | Optional | string | Numeric TCP port to which the target node is redirected + +If both redirect_host and redirect_port are omitted, clear the redirect portal. + +### Example + +Example request: + +~~~ +{ + "params": { + "name": "iqn.2016-06.io.spdk:target1", + "pg_tag": 1, + "redirect_host": "10.0.0.3", + "redirect_port": "3260" + }, + "jsonrpc": "2.0", + "method": "iscsi_target_node_set_redirect", + "id": 1 +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ + +## iscsi_target_node_request_logout method {#rpc_iscsi_target_node_request_logout} + +For the target node, request connections whose portal group tag matches to log out, +or request all connections to log out if portal group tag is omitted.
+ +### Parameters + +Name | Optional | Type | Description +--------------------------- | -------- | --------| ----------- +name | Required | string | Target node name (ASCII) +pg_tag | Optional | number | Existing portal group tag + +### Example + +Example request: + +~~~ +{ + "params": { + "name": "iqn.2016-06.io.spdk:target1", + "pg_tag": 1 + }, + "jsonrpc": "2.0", + "method": "iscsi_target_node_request_logout", + "id": 1 +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ + # NVMe-oF Target {#jsonrpc_components_nvmf_tgt} ## nvmf_create_transport method {#rpc_nvmf_create_transport} @@ -4119,6 +4395,8 @@ no_srq | Optional | boolean | Disable shared receive queue c2h_success | Optional | boolean | Disable C2H success optimization (TCP only) dif_insert_or_strip | Optional | boolean | Enable DIF insert for write I/O and DIF strip for read I/O DIF sock_priority | Optional | number | The socket priority of the connection owned by this transport (TCP only) +acceptor_backlog | Optional | number | The number of pending connections allowed in backlog before failing new connection attempts (RDMA only) +abort_timeout_sec | Optional | number | Abort execution timeout value, in seconds ### Example @@ -4220,6 +4498,7 @@ serial_number | Optional | string | Serial number of virtual cont model_number | Optional | string | Model number of virtual controller max_namespaces | Optional | number | Maximum number of namespaces that can be attached to the subsystem. Default: 0 (Unlimited) allow_any_host | Optional | boolean | Allow any host (`true`) or enforce allowed host whitelist (`false`). Default: `false`. +ana_reporting | Optional | boolean | Enable ANA reporting feature (default: `false`). ### Example @@ -4337,6 +4616,94 @@ Example response: } ~~~ +## nvmf_subsystem_remove_listener method {#rpc_nvmf_subsystem_remove_listener} + +Remove a listen address from an NVMe-oF subsystem. 
+ +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +nqn | Required | string | Subsystem NQN +tgt_name | Optional | string | Parent NVMe-oF target name. +listen_address | Required | object | @ref rpc_nvmf_listen_address object + +### Example + +Example request: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "method": "nvmf_subsystem_remove_listener", + "params": { + "nqn": "nqn.2016-06.io.spdk:cnode1", + "listen_address": { + "trtype": "RDMA", + "adrfam": "IPv4", + "traddr": "192.168.0.123", + "trsvcid": "4420" + } + } +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ + +## nvmf_subsystem_listener_set_ana_state method {#rpc_nvmf_subsystem_listener_set_ana_state} + +Set ANA state of a listener for an NVMe-oF subsystem. + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +nqn | Required | string | Subsystem NQN +tgt_name | Optional | string | Parent NVMe-oF target name. +listen_address | Required | object | @ref rpc_nvmf_listen_address object +ana_state | Required | string | ANA state to set ("optimized", "non_optimized", or "inaccessible") + +### Example + +Example request: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "method": "nvmf_subsystem_listener_set_ana_state", + "params": { + "nqn": "nqn.2016-06.io.spdk:cnode1", + "listen_address": { + "trtype": "RDMA", + "adrfam": "IPv4", + "traddr": "192.168.0.123", + "trsvcid": "4420" + }, + "ana_state": "inaccessible" + } +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ + ## nvmf_subsystem_add_ns method {#rpc_nvmf_subsystem_add_ns} Add a namespace to a subsystem. The namespace ID is returned as the result.
@@ -4542,6 +4909,148 @@ Example response: } ~~~ +## nvmf_subsystem_get_controllers {#rpc_nvmf_subsystem_get_controllers} + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +nqn | Required | string | Subsystem NQN +tgt_name | Optional | string | Parent NVMe-oF target name. + +### Example + +Example request: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "method": "nvmf_subsystem_get_controllers", + "params": { + "nqn": "nqn.2016-06.io.spdk:cnode1" + } +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": [ + { + "cntlid": 1, + "hostnqn": "nqn.2016-06.io.spdk:host1", + "hostid": "27dad528-6368-41c3-82d3-0b956b49025d", + "num_io_qpairs": 5 + } + ] +} +~~~ + +## nvmf_subsystem_get_qpairs {#rpc_nvmf_subsystem_get_qpairs} + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +nqn | Required | string | Subsystem NQN +tgt_name | Optional | string | Parent NVMe-oF target name. + +### Example + +Example request: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "method": "nvmf_subsystem_get_qpairs", + "params": { + "nqn": "nqn.2016-06.io.spdk:cnode1" + } +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": [ + { + "cntlid": 1, + "qid": 0, + "state": "active", + "listen_address": { + "trtype": "RDMA", + "adrfam": "IPv4", + "traddr": "192.168.0.123", + "trsvcid": "4420" + } + }, + { + "cntlid": 1, + "qid": 1, + "state": "active", + "listen_address": { + "trtype": "RDMA", + "adrfam": "IPv4", + "traddr": "192.168.0.123", + "trsvcid": "4420" + } + } + ] +} +~~~ + +## nvmf_subsystem_get_listeners {#rpc_nvmf_subsystem_get_listeners} + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +nqn | Required | string | Subsystem NQN +tgt_name | Optional | string | Parent NVMe-oF target name. 
+ +### Example + +Example request: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "method": "nvmf_subsystem_get_listeners", + "params": { + "nqn": "nqn.2016-06.io.spdk:cnode1" + } +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": [ + { + "address": { + "trtype": "RDMA", + "adrfam": "IPv4", + "traddr": "192.168.0.123", + "trsvcid": "4420" + }, + "ana_state": "optimized" + } + ] +} +~~~ + ## nvmf_set_max_subsystems {#rpc_nvmf_set_max_subsystems} Set the maximum allowed subsystems for the NVMe-oF target. This RPC may only be called @@ -4654,7 +5163,8 @@ Example response: "max_io_qpairs_per_ctrlr": 64, "in_capsule_data_size": 4096, "max_io_size": 131072, - "io_unit_size": 131072 + "io_unit_size": 131072, + "abort_timeout_sec": 1 } ] } @@ -5795,7 +6305,7 @@ Name | Optional | Type | Description ----------------------- | -------- | ----------- | ----------- name | Required | string | RAID bdev name strip_size_kb | Required | number | Strip size in KB -raid_level | Required | number | RAID level +raid_level | Required | string | RAID level base_bdevs | Required | string | Base bdevs name, whitespace separated list in quotes ### Example @@ -5809,7 +6319,7 @@ Example request: "id": 1, "params": { "name": "Raid0", - "raid_level": 0, + "raid_level": "0", "base_bdevs": [ "Malloc0", "Malloc1", @@ -6543,6 +7053,103 @@ Example response: } ~~~ +# Socket layer {#jsonrpc_components_sock} + +## sock_impl_get_options {#rpc_sock_impl_get_options} + +Get parameters for the socket layer implementation. + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +impl_name | Required | string | Name of socket implementation, e.g. posix + +### Response + +Response is an object with current socket layer options for requested implementation. 
+ +### Example + +Example request: + +~~~ +{ + "jsonrpc": "2.0", + "method": "sock_impl_get_options", + "id": 1, + "params": { + "impl_name": "posix" + } +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": { + "recv_buf_size": 2097152, + "send_buf_size": 2097152, + "enable_recv_pipe": true, + "enable_zerocopy_send": true + } +} +~~~ + +## sock_impl_set_options {#rpc_sock_impl_set_options} + +Set parameters for the socket layer implementation. + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +impl_name | Required | string | Name of socket implementation, e.g. posix +recv_buf_size | Optional | number | Size of socket receive buffer in bytes +send_buf_size | Optional | number | Size of socket send buffer in bytes +enable_recv_pipe | Optional | boolean | Enable or disable receive pipe +enable_zerocopy_send | Optional | boolean | Enable or disable zero copy on send +enable_quick_ack | Optional | boolean | Enable or disable quick ACK +enable_placement_id | Optional | boolean | Enable or disable placement_id + +### Response + +True if socket layer options were set successfully. + +### Example + +Example request: + +~~~ +{ + "jsonrpc": "2.0", + "method": "sock_impl_set_options", + "id": 1, + "params": { + "impl_name": "posix", + "recv_buf_size": 2097152, + "send_buf_size": 2097152, + "enable_recv_pipe": false, + "enable_zerocopy_send": true, + "enable_quick_ack": false, + "enable_placement_id": false + } +} +~~~ + +Example response: + +~~~ +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ + # Miscellaneous RPC commands ## bdev_nvme_send_cmd {#rpc_bdev_nvme_send_cmd} diff --git a/doc/libraries.md b/doc/libraries.md index 6bdc0106b48..a1c4dc36dc6 100644 --- a/doc/libraries.md +++ b/doc/libraries.md @@ -14,7 +14,7 @@ of the libraries contained in these two directories. The SPDK libraries are divided into two directories.
The `lib` directory contains the base libraries that compose SPDK. Some of these base libraries define plug-in systems. Instances of those plug-ins are called modules and are located in the `module` directory. For example, the `spdk_sock` library is contained in the -`lib` directory while the implementations of socket abstractions, `sock_posix`, `sock_uring`, and `sock_vpp` +`lib` directory while the implementations of socket abstractions, `sock_posix` and `sock_uring` are contained in the `module` directory. ## lib {#lib} diff --git a/doc/nvmf.md b/doc/nvmf.md index 34a69b66faf..25b0697d36a 100644 --- a/doc/nvmf.md +++ b/doc/nvmf.md @@ -158,8 +158,8 @@ and an in capsule data size of 0 bytes. The TCP transport is configured with an ~~~{.sh} build/bin/nvmf_tgt -scripts/rpc.py nvmf_create_transport -t RDMA -u 8192 -p 4 -c 0 -scripts/rpc.py nvmf_create_transport -t TCP -u 16384 -p 8 -c 8192 +scripts/rpc.py nvmf_create_transport -t RDMA -u 8192 -m 4 -c 0 +scripts/rpc.py nvmf_create_transport -t TCP -u 16384 -m 8 -c 8192 ~~~ Below is an example of creating a malloc bdev and assigning it to a subsystem. 
Adjust the bdevs, diff --git a/doc/performance_reports.md b/doc/performance_reports.md index 08d15e4d26b..3c80a2330b9 100644 --- a/doc/performance_reports.md +++ b/doc/performance_reports.md @@ -1,8 +1,16 @@ # Performance Reports {#performance_reports} +## Release 20.07 + +- [SPDK 20.07 NVMe-oF TCP Performance Report](https://ci.spdk.io/download/performance-reports/SPDK_tcp_perf_report_2007.pdf) +- [SPDK 20.07 NVMe-oF RDMA Performance Report](https://ci.spdk.io/download/performance-reports/SPDK_rdma_perf_report_2007.pdf) +- [SPDK 20.07 Vhost Performance Report](https://ci.spdk.io/download/performance-reports/SPDK_vhost_perf_report_2007.pdf) + ## Release 20.04 - [SPDK 20.04 NVMe-oF TCP Performance Report](https://ci.spdk.io/download/performance-reports/SPDK_tcp_perf_report_2004.pdf) +- [SPDK 20.04 NVMe-oF RDMA Performance Report](https://ci.spdk.io/download/performance-reports/SPDK_rdma_perf_report_2004.pdf) +- [SPDK 20.04 Vhost Performance Report](https://ci.spdk.io/download/performance-reports/SPDK_vhost_perf_report_2004.pdf) ## Release 20.01 diff --git a/doc/spdk_top.md b/doc/spdk_top.md new file mode 100644 index 00000000000..c9ef0de4e8f --- /dev/null +++ b/doc/spdk_top.md @@ -0,0 +1,45 @@ +# spdk_top {#spdk_top} + +The spdk_top application is designed to resemble the standard top in that it provides real-time insights into CPU core usage by SPDK lightweight threads and pollers. Have you ever wondered which CPU core is used most by your SPDK instance? Are you building your own bdev or library and want to know if your code is running efficiently? Are your new pollers busy most of the time? The spdk_top application uses RPC calls to collect performance metrics and displays them in a report that you can analyze and determine if your code is running efficiently so that you can tune your implementation and get more from SPDK. + +Why doesn't the classic top utility work for SPDK?
SPDK uses a polled-mode design; a reactor thread running on each CPU core assigned to an SPDK application schedules SPDK lightweight threads and pollers to run on the CPU core. Therefore, the standard Linux top utility is not effective for analyzing the CPU usage for polled-mode applications like SPDK because it just reports that they are using 100% of the CPU resources assigned to them. The spdk_top utility was developed to analyze and report the CPU cycles used to do real work vs just polling for work. The utility relies on instrumentation added to pollers to track when they are doing work vs. polling for work. The spdk_top utility gets the fine-grained metrics from the pollers, analyzes and reports the metrics on a per poller, thread and core basis. This information enables users to identify CPU cores that are busy doing real work so that they can determine if the application needs more or less CPU resources. + +# Run spdk_top +Before running spdk_top you need to run the SPDK application whose performance you want to analyze using spdk_top. For example, the nvmf_tgt application was running when we used spdk_top to capture the screenshots in this documentation. + +Run the spdk_top application + +~~~{.sh} +./build/bin/spdk_top +~~~ + +The spdk_top application has 3 tabs: the cores, threads and pollers tabs. + +# Threads Tab +The threads tab displays a line item for each spdk thread that includes information such as which CPU core the spdk thread is running on, how many pollers the thread is running and how many microseconds the thread was busy/idle. The pollers are grouped into active, timed and paused pollers. To learn more about spdk threads see @ref concurrency. + +![Threads Tab](img/spdk_top_page1_threads.png) + +# Pollers Tab +The pollers tab displays a line item for each poller and a running counter of the number of times the poller has run so that you can see which pollers are running most frequently.
+ +![Pollers Tab](img/spdk_top_page2_pollers.png) + +# Cores Tab +The cores tab provides insights into how the application is using the CPU cores assigned to it. +It has a line item for each CPU core assigned to the application which shows the number of threads and pollers +running on the CPU core. The tab also indicates how busy/idle each CPU core was in the last 1 second. +The busy column displays how many microseconds the CPU core was doing actual work in the last 1 second. +The idle column displays how many microseconds the CPU core was idle in the last 1 second, +including the time when the CPU core ran pollers but did not find any work. + +![Cores Tab](img/spdk_top_page3_cores.png) + +# Refresh Rate +You can control how often the spdk_top application refreshes the data displayed by hitting the 'r' key on your keyboard and specifying a value between 0 and 255 seconds. + +# Sorting +You can sort the data displayed by hitting the 's' key on your keyboard and selecting a column to sort by in the sub menu that is displayed. + +# Filtering +You can filter out any column by hitting the 'c' key on your keyboard and unselecting the column in the menu that is displayed. diff --git a/doc/tools.md b/doc/tools.md index 6fa7faefe19..ecf7d9fe592 100644 --- a/doc/tools.md +++ b/doc/tools.md @@ -2,3 +2,5 @@ - @subpage spdkcli - @subpage nvme-cli +- @subpage bdevperf +- @subpage spdk_top diff --git a/doc/vpp_integration.md b/doc/vpp_integration.md deleted file mode 100644 index 3b09e52430e..00000000000 --- a/doc/vpp_integration.md +++ /dev/null @@ -1,237 +0,0 @@ -# Vector Packet Processing {#vpp_integration} - -VPP (part of [Fast Data - Input/Output](https://fd.io/) project) is an extensible -userspace framework providing networking functionality. It is built around the concept of -packet processing graph (see [What is VPP?](https://wiki.fd.io/view/VPP/What_is_VPP?)).
- -Detailed instructions for **simplified steps 1-3** below, can be found on -VPP [Quick Start Guide](https://wiki.fd.io/view/VPP). - -*SPDK supports VPP version 19.04.2.* - -# 1. Building VPP (optional) {#vpp_build} - -*Please skip this step if using already built packages.* - -Clone and checkout VPP -~~~ -git clone https://gerrit.fd.io/r/vpp && cd vpp -git checkout v19.04.2 -~~~ - -Install VPP build dependencies -~~~ -make install-dep -~~~ - -Build and create .rpm packages -~~~ -make pkg-rpm -~~~ - -Alternatively, build and create .deb packages -~~~ -make bootstrap && make pkg-deb -~~~ - -Packages can be found in `vpp/build-root/` directory. - -For more in depth instructions please see Building section in -[VPP documentation](https://wiki.fd.io/view/VPP/Pulling,_Building,_Running,_Hacking_and_Pushing_VPP_Code#Building) - -# 2. Installing VPP {#vpp_install} - -Packages can be installed from a distribution repository or built in previous step. -Minimal set of packages consists of `vpp`, `vpp-lib` and `vpp-devel`. - -*Note: Please remove or modify /etc/sysctl.d/80-vpp.conf file with appropriate values -dependent on number of hugepages that will be used on system.* - -# 3. Running VPP {#vpp_run} - -VPP takes over any network interfaces that were bound to userspace driver, -for details please see DPDK guide on -[Binding and Unbinding Network Ports to/from the Kernel Modules](http://dpdk.org/doc/guides/linux_gsg/linux_drivers.html#binding-and-unbinding-network-ports-to-from-the-kernel-modules). - -VPP is installed as service and disabled by default. To start VPP with default config: -~~~ -sudo systemctl start vpp -~~~ - -Alternatively, use `vpp` binary directly -~~~ -sudo vpp unix {cli-listen /run/vpp/cli.sock} session { evt_qs_memfd_seg } socksvr { socket-name /run/vpp-api.sock } -~~~ - -# 4. 
Configure VPP {#vpp_config} - -VPP can be configured using a VPP startup file and the `vppctl` command; By default, the VPP startup file is `/etc/vpp/startup.conf`, however, you can pass any file with the `-c` vpp command argument. - -## Startup configuration - -Some key values from iSCSI point of view includes: - -CPU section (`cpu`): - -- `main-core ` -- logical CPU core used for main thread. -- `corelist-workers ` -- logical CPU cores where worker threads are running. - -DPDK section (`dpdk`): - -- `num-rx-queues ` -- number of receive queues. -- `num-tx-queues ` -- number of transmit queues. -- `dev ` -- whitelisted device. - -Session section (`session`): - -- `evt_qs_memfd_seg` -- uses a memfd segment for event queues. This is required for SPDK. - -Socket server session (`socksvr`): - -- `socket-name ` -- configure API socket filename (curently SPDK uses default path `/run/vpp-api.sock`). - -Plugins section (`plugins`): - -- `plugin { [enable|disable] }` -- enable or disable VPP plugin. - -### Example - -~~~ -unix { - nodaemon - cli-listen /run/vpp/cli.sock -} -cpu { - main-core 1 -} -session { - evt_qs_memfd_seg -} -socksvr { - socket-name /run/vpp-api.sock -} -plugins { - plugin default { disable } - plugin dpdk_plugin.so { enable } -} -~~~ - -## vppctl command tool - -The `vppctl` command tool allows users to control VPP at runtime via a command prompt -~~~ -sudo vppctl -~~~ - -Or, by sending single command directly. For example to display interfaces within VPP: -~~~ -sudo vppctl show interface -~~~ - -Useful commands: - -- `show interface` -- show interfaces settings, state and some basic statistics. -- `show interface address` -- show interfaces state and assigned addresses. - -- `set interface ip address
` -- set interfaces IP address. -- `set interface state [up|down]` -- bring interface up or down. - -- `show errors` -- show error counts. - -## Example: Configure two interfaces to be available via VPP - -We want to configure two DPDK ports with PCI addresses 0000:09:00.1 and 0000:0b:00.1 -to be used as portals 10.0.0.1/24 and 10.10.0.1/24. - -In the VPP startup file (e.g. `/etc/vpp/startup.conf`), whitelist the interfaces -by specifying PCI addresses in section dpdk: -~~~ - dev 0000:09:00.1 - dev 0000:0b:00.1 -~~~ - -Bind PCI NICs to UIO driver (`igb_uio` or `uio_pci_generic`). - -Restart vpp and use vppctl tool to verify interfaces. -~~~ -$ vppctl show interface - Name Idx State MTU (L3/IP4/IP6/MPLS) Counter Count - -FortyGigabitEthernet9/0/1 1 down 9000/0/0/0 -FortyGigabitEthernetb/0/1 2 down 9000/0/0/0 -~~~ - -Set appropriate addresses and bring interfaces up: -~~~ -$ vppctl set interface ip address FortyGigabitEthernet9/0/1 10.0.0.1/24 -$ vppctl set interface state FortyGigabitEthernet9/0/1 up -$ vppctl set interface ip address FortyGigabitEthernetb/0/1 10.10.0.1/24 -$ vppctl set interface state FortyGigabitEthernetb/0/1 up -~~~ - -Verify configuration: -~~~ -$ vppctl show interface address -FortyGigabitEthernet9/0/1 (up): - L3 10.0.0.1/24 -FortyGigabitEthernetb/0/1 (up): - L3 10.10.0.1/24 -~~~ - -Now, both interfaces are ready to use. To verify conectivity you can ping -10.0.0.1 and 10.10.0.1 addresses from another machine. - -## Example: Tap interfaces on single host - -For functional test purposes a virtual tap interface can be created, -so no additional network hardware is required. -This will allow network communication between SPDK iSCSI target using VPP end of tap -and kernel iSCSI initiator using the kernel part of tap. A single host is used in this scenario. 
- -Create tap interface via VPP -~~~ - vppctl tap connect tap0 - vppctl set interface state tapcli-0 up - vppctl set interface ip address tapcli-0 10.0.0.1/24 - vppctl show int addr -~~~ - -Assign address on kernel interface -~~~ - sudo ip addr add 10.0.0.2/24 dev tap0 - sudo ip link set tap0 up -~~~ - -To verify connectivity -~~~ - ping 10.0.0.1 -~~~ - -# 5. Building SPDK with VPP {#vpp_built_into_spdk} - -Support for VPP can be built into SPDK by using configuration option. -~~~ -configure --with-vpp -~~~ - -Alternatively, directory with built libraries can be pointed at -and will be used for compilation instead of installed packages. -~~~ -configure --with-vpp=/path/to/vpp/repo/build-root/install-vpp-native/vpp -~~~ - -# 6. Running SPDK with VPP {#vpp_running_with_spdk} - -VPP application has to be started before SPDK application, in order to enable -usage of network interfaces. For example, if you use SPDK iSCSI target or -NVMe-oF target, after the initialization finishes, interfaces configured within -VPP will be available to be configured as portal addresses. - -Moreover, you do not need to specifiy which TCP sock implementation (e.g., posix, -VPP) to be used through configuration file or RPC call. Since SPDK program -automatically determines the protocol according to the configured portal addresses -info. For example, you can specify a Listen address in NVMe-oF subsystem -configuration such as "Listen TCP 10.0.0.1:4420". SPDK programs automatically -uses different implemenation to listen this provided portal info via posix or -vpp implemenation(if compiled in SPDK program), and only one implementation can -successfully listen on the provided portal. 
diff --git a/dpdk b/dpdk index ef71bfaface..64f1ced13f9 160000 --- a/dpdk +++ b/dpdk @@ -1 +1 @@ -Subproject commit ef71bfaface10cc19b75e45d3158ab71a788e3a9 +Subproject commit 64f1ced13f974e8b3d46b87c361a09eca68126f9 diff --git a/dpdkbuild/Makefile b/dpdkbuild/Makefile index cd9a7082d1b..9568f263c74 100644 --- a/dpdkbuild/Makefile +++ b/dpdkbuild/Makefile @@ -36,87 +36,44 @@ include $(SPDK_ROOT_DIR)/mk/spdk.common.mk .PHONY: all clean install uninstall -DPDK_FRAMEWORK = n -DPDK_OPTS = +DPDK_OPTS = -Denable_docs=false DPDK_CFLAGS = -DPDK_OPTS += CONFIG_RTE_BUILD_SHARED_LIB=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_PMD_AESNI_MB=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_REORDER=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_ETHER=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_CMDLINE=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_METER=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_HASH=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_VHOST=n -DPDK_OPTS += CONFIG_RTE_EAL_IGB_UIO=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_PMD_QAT=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_PMD_QAT_SYM=n -DPDK_OPTS += CONFIG_RTE_LIBRTE_PMD_ISAL=n - -ifeq ($(CONFIG_SHARED),y) -DPDK_OPTS += CONFIG_RTE_BUILD_SHARED_LIB=y -DPDK_LDFLAGS+= -rpath $(SPDK_ROOT_DIR)/dpdk/build/lib -endif -ifeq ($(CONFIG_CRYPTO),y) -DPDK_FRAMEWORK = y -DPDK_OPTS += CONFIG_RTE_LIBRTE_PMD_AESNI_MB=y -DPDK_OPTS += CONFIG_RTE_LIBRTE_REORDER=y -DPDK_CFLAGS += -I$(IPSEC_MB_DIR) -DPDK_LDFLAGS += -L$(IPSEC_MB_DIR) +DPDK_KMODS = false +ifeq ($(CONFIG_IGB_UIO_DRIVER),y) +DPDK_KMODS = true endif - -ifeq ($(CONFIG_REDUCE),y) -DPDK_FRAMEWORK = y -DPDK_OPTS += CONFIG_RTE_LIBRTE_PMD_ISAL=y -DPDK_CFLAGS += -I$(ISAL_DIR) -DPDK_LDFLAGS += -L$(ISAL_DIR)/.libs +ifeq ($(OS),FreeBSD) +DPDK_KMODS = true endif +DPDK_OPTS += -Denable_kmods=$(DPDK_KMODS) -ifeq ($(CONFIG_VHOST),y) -DPDK_OPTS += CONFIG_RTE_LIBRTE_ETHER=y -DPDK_OPTS += CONFIG_RTE_LIBRTE_CMDLINE=y -DPDK_OPTS += CONFIG_RTE_LIBRTE_METER=y -DPDK_OPTS += CONFIG_RTE_LIBRTE_HASH=y -DPDK_OPTS += CONFIG_RTE_LIBRTE_VHOST=y -endif +# the drivers we use +DPDK_DRIVERS = bus bus/pci 
bus/vdev mempool/ring -ifeq ($(CONFIG_IGB_UIO_DRIVER),y) -DPDK_OPTS += CONFIG_RTE_EAL_IGB_UIO=y +# common crypto/reduce drivers +ifeq ($(findstring y,$(CONFIG_CRYPTO)$(CONFIG_REDUCE)),y) +DPDK_DRIVERS += crypto/qat compress/qat common/qat endif -ifeq ($(CONFIG_RAID5),y) -DPDK_OPTS += CONFIG_RTE_LIBRTE_HASH=y +ifeq ($(CONFIG_CRYPTO),y) +# crypto/qat is just a stub, the compress/qat pmd is used instead +DPDK_DRIVERS += crypto crypto/aesni_mb +DPDK_CFLAGS += -I$(IPSEC_MB_DIR) +DPDK_LDFLAGS += -L$(IPSEC_MB_DIR) endif -ifeq ($(DPDK_FRAMEWORK),y) -DPDK_OPTS += CONFIG_RTE_LIBRTE_PMD_QAT=y -DPDK_OPTS += CONFIG_RTE_LIBRTE_PMD_QAT_SYM=y +ifeq ($(CONFIG_REDUCE),y) +DPDK_DRIVERS += compress compress/isal +DPDK_CFLAGS += -I$(ISAL_DIR) +DPDK_LDFLAGS += -L$(ISAL_DIR)/.libs -lisal endif -ifeq ($(TARGET_MACHINE),aarch64) -DPDK_CONFIG := arm64-armv8a -else -DPDK_CONFIG := $(TARGET_MACHINE)-native -endif +DPDK_OPTS += -Dmachine=$(TARGET_ARCHITECTURE) ifneq ($(CONFIG_CROSS_PREFIX),) -DPDK_OPTS += CROSS=$(CONFIG_CROSS_PREFIX)- -endif - -ifeq ($(OS),Linux) -DPDK_CONFIG := $(DPDK_CONFIG)-linuxapp -NPROC := $(shell nproc) -else -ifeq ($(OS),FreeBSD) -DPDK_CONFIG := $(DPDK_CONFIG)-bsdapp -NPROC := $(shell sysctl hw.ncpu | awk '{print $$NF}') -endif -endif - -ifeq ($(CC_TYPE),clang) -DPDK_CONFIG := $(DPDK_CONFIG)-clang -else -DPDK_CONFIG := $(DPDK_CONFIG)-gcc +$(error Automatic DPDK cross build is not supported. Please compile DPDK manually \ +with e.g. `meson build --cross-file config/arm/arm64_armv8_linux_gcc`) endif DPDK_CFLAGS += -fPIC @@ -139,18 +96,73 @@ endif # Allow users to specify EXTRA_DPDK_CFLAGS if they want to build DPDK using unsupported compiler versions DPDK_CFLAGS += $(EXTRA_DPDK_CFLAGS) +ifeq ($(CC_TYPE),gcc) +GCC_MAJOR = $(shell echo __GNUC__ | $(CC) -E -x c - | tail -n 1) +ifeq ($(shell test $(GCC_MAJOR) -ge 10 && echo 1), 1) +#1. gcc 10 complains on operations with zero size arrays in rte_cryptodev.c, so +#disable this warning +#2. 
gcc 10 disables fcommon by default and complains on multiple definition of +#aesni_mb_logtype_driver symbol which is defined in header file and presented in sevral +#translation units +DPDK_CFLAGS += -Wno-stringop-overflow -fcommon +endif +endif + # Force-disable scan-build SUB_CC = $(patsubst %ccc-analyzer,$(DEFAULT_CC),$(CC)) -$(SPDK_ROOT_DIR)/dpdk/build: $(SPDK_ROOT_DIR)/mk/cc.mk $(SPDK_ROOT_DIR)/include/spdk/config.h - $(Q)rm -rf $(SPDK_ROOT_DIR)/dpdk/build - $(Q)$(MAKE) -C $(SPDK_ROOT_DIR)/dpdk config T=$(DPDK_CONFIG) $(DPDK_OPTS) +DPDK_ALL_DRIVER_DIRS = $(shell find $(SPDK_ROOT_DIR)/dpdk/drivers -mindepth 1 -type d) +DPDK_ALL_DRIVERS = $(DPDK_ALL_DRIVER_DIRS:$(SPDK_ROOT_DIR)/dpdk/drivers/%=%) +DPDK_DISABLED_DRVERS = $(filter-out $(DPDK_DRIVERS),$(DPDK_ALL_DRIVERS)) + +ifeq ($(OS),Linux) +SED_INPLACE_FLAG = "-i" +MESON_PREFIX = $(SPDK_ROOT_DIR)/dpdk/build +else +SED_INPLACE_FLAG = "-i ''" +MESON_PREFIX = "/" +endif + +# Some ninja versions come with a (broken?) jobserver which defaults to use +# only 1 thread for the build. We workaround this by specifying -j to ninja +# with the same value as top-makefile. This is OK as long as DPDK is not built +# in parralel with anything else, which is the case for now. +ifeq ($(MAKE_PID),) +MAKE_PID := $(shell echo $$PPID) +endif -all: $(SPDK_ROOT_DIR)/dpdk/build - $(Q)$(MAKE) -C $(SPDK_ROOT_DIR)/dpdk/build EXTRA_CFLAGS="$(DPDK_CFLAGS)" EXTRA_LDFLAGS="$(DPDK_LDFLAGS)" CC=$(SUB_CC) T="$(DPDK_CONFIG)" $(DPDK_OPTS) +MAKE_NUMJOBS := $(shell ps T | sed -nE 's/\s*$(MAKE_PID)\s.* (-j|--jobs=)( *[0-9]+).*/\1\2/p') + +all: $(SPDK_ROOT_DIR)/dpdk/build-tmp + $(Q)# DPDK doesn't handle nested make calls, so unset MAKEFLAGS + $(Q)env -u MAKEFLAGS ninja -C $(SPDK_ROOT_DIR)/dpdk/build-tmp $(MAKE_NUMJOBS) + $(Q) \ + # Meson on FreeBSD sometimes appends --prefix value to the default DESTDIR (which is e.g. \ + # /usr/local) instead of replacing it. 
--prefix needs to be an absolute path, so we set \ + # it to / and then set DESTDIR directly, so libs and headers are copied to "DESTDIR//". \ + # DPDK kernel modules are set to install in $DESTDIR/boot/modules, but we move them \ + # to DESTDIR/kmod to be consistent with the makefile build. + $(Q)if [ "$(OS)" = "FreeBSD" ]; then \ + env -u MAKEFLAGS DESTDIR=$(SPDK_ROOT_DIR)/dpdk/build ninja -C $(SPDK_ROOT_DIR)/dpdk/build-tmp $(MAKE_NUMJOBS) install > /dev/null && \ + mv $(SPDK_ROOT_DIR)/dpdk/build/boot/modules $(SPDK_ROOT_DIR)/dpdk/build/kmod; \ + else \ + env -u MAKEFLAGS ninja -C $(SPDK_ROOT_DIR)/dpdk/build-tmp $(MAKE_NUMJOBS) install > /dev/null; \ + fi + +$(SPDK_ROOT_DIR)/dpdk/build-tmp: $(SPDK_ROOT_DIR)/mk/cc.mk $(SPDK_ROOT_DIR)/include/spdk/config.h + $(Q)rm -rf $(SPDK_ROOT_DIR)/dpdk/build $(SPDK_ROOT_DIR)/dpdk/build-tmp + $(Q)cd "$(SPDK_ROOT_DIR)/dpdk"; CC="$(SUB_CC)" meson --prefix="$(MESON_PREFIX)" --libdir lib -Dc_args="$(DPDK_CFLAGS)" -Dc_link_args="$(DPDK_LDFLAGS)" $(DPDK_OPTS) -Ddisable_drivers="$(shell echo $(DPDK_DISABLED_DRVERS) | sed -E "s/ +/,/g")" build-tmp + $(Q)sed $(SED_INPLACE_FLAG) 's/#define RTE_EAL_PMD_PATH .*/#define RTE_EAL_PMD_PATH ""/g' $(SPDK_ROOT_DIR)/dpdk/build-tmp/rte_build_config.h + $(Q) \ + # TODO Meson build adds libbsd dependency when it's available. This means any app will be \ + # forced to link with -lbsd, but only if it's available on the system. The clean way to \ + # handle this would be to rely on DPDK's pkg-config file which will contain the -lbsd when \ + # required. For now just remove the libbsd dependency. DPDK will fallback to its internal \ + # functions. 
+ $(Q)sed $(SED_INPLACE_FLAG) 's/#define RTE_USE_LIBBSD .*//g' $(SPDK_ROOT_DIR)/dpdk/build-tmp/rte_build_config.h clean: - $(Q)rm -rf $(SPDK_ROOT_DIR)/dpdk/build + $(Q)rm -rf $(SPDK_ROOT_DIR)/dpdk/build $(SPDK_ROOT_DIR)/dpdk/build-tmp install: @: diff --git a/examples/accel/perf/Makefile b/examples/accel/perf/Makefile index 78de87892f1..b28f7c412aa 100644 --- a/examples/accel/perf/Makefile +++ b/examples/accel/perf/Makefile @@ -40,7 +40,7 @@ APP = accel_perf C_SRCS := accel_perf.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += bdev accel event thread util conf trace \ log jsonrpc json rpc sock notify diff --git a/examples/accel/perf/accel_perf.c b/examples/accel/perf/accel_perf.c index 1b386054fe2..2093253c1d1 100644 --- a/examples/accel/perf/accel_perf.c +++ b/examples/accel/perf/accel_perf.c @@ -39,6 +39,7 @@ #include "spdk/string.h" #include "spdk/accel_engine.h" #include "spdk/crc32.h" +#include "spdk/util.h" #define DATA_PATTERN 0x5a #define ALIGN_4K 0x1000 @@ -58,6 +59,8 @@ static enum accel_capability g_workload_selection; static struct worker_thread *g_workers = NULL; static int g_num_workers = 0; static pthread_mutex_t g_workers_lock = PTHREAD_MUTEX_INITIALIZER; +uint64_t g_capabilites; +struct ap_task; struct worker_thread { struct spdk_io_channel *ch; @@ -65,7 +68,7 @@ struct worker_thread { uint64_t xfer_failed; uint64_t injected_miscompares; uint64_t current_queue_depth; - struct spdk_mempool *task_pool; + TAILQ_HEAD(, ap_task) tasks; struct worker_thread *next; unsigned core; struct spdk_thread *thread; @@ -81,20 +84,9 @@ struct ap_task { struct worker_thread *worker; int status; int expected_status; /* used for compare */ + TAILQ_ENTRY(ap_task) link; }; -inline static struct ap_task * -__ap_task_from_accel_task(struct spdk_accel_task *at) -{ - return (struct ap_task *)((uintptr_t)at - sizeof(struct ap_task)); -} - -inline static struct spdk_accel_task * 
-__accel_task_from_ap_task(struct ap_task *ap) -{ - return (struct spdk_accel_task *)((uintptr_t)ap + sizeof(struct ap_task)); -} - static void dump_user_config(struct spdk_app_opts *opts) { @@ -181,8 +173,13 @@ static void unregister_worker(void *arg1) { struct worker_thread *worker = arg1; + struct ap_task *task; - spdk_mempool_free(worker->task_pool); + while (!TAILQ_EMPTY(&worker->tasks)) { + task = TAILQ_FIRST(&worker->tasks); + TAILQ_REMOVE(&worker->tasks, task, link); + free(task); + } spdk_put_io_channel(worker->ch); pthread_mutex_lock(&g_workers_lock); assert(g_num_workers >= 1); @@ -209,20 +206,18 @@ _submit_single(void *arg1, void *arg2) task->worker->current_queue_depth++; switch (g_workload_selection) { case ACCEL_COPY: - rc = spdk_accel_submit_copy(__accel_task_from_ap_task(task), - worker->ch, task->dst, - task->src, g_xfer_size_bytes, accel_done); + rc = spdk_accel_submit_copy(worker->ch, task->dst, task->src, + g_xfer_size_bytes, accel_done, task); break; case ACCEL_FILL: /* For fill use the first byte of the task->dst buffer */ - rc = spdk_accel_submit_fill(__accel_task_from_ap_task(task), - worker->ch, task->dst, *(uint8_t *)task->src, - g_xfer_size_bytes, accel_done); + rc = spdk_accel_submit_fill(worker->ch, task->dst, *(uint8_t *)task->src, + g_xfer_size_bytes, accel_done, task); break; case ACCEL_CRC32C: - rc = spdk_accel_submit_crc32c(__accel_task_from_ap_task(task), - worker->ch, (uint32_t *)task->dst, task->src, g_crc32c_seed, - g_xfer_size_bytes, accel_done); + rc = spdk_accel_submit_crc32c(worker->ch, (uint32_t *)task->dst, + task->src, g_crc32c_seed, + g_xfer_size_bytes, accel_done, task); break; case ACCEL_COMPARE: random_num = rand() % 100; @@ -233,14 +228,12 @@ _submit_single(void *arg1, void *arg2) task->expected_status = 0; *(uint8_t *)task->dst = DATA_PATTERN; } - rc = spdk_accel_submit_compare(__accel_task_from_ap_task(task), - worker->ch, task->dst, task->src, - g_xfer_size_bytes, accel_done); + rc = 
spdk_accel_submit_compare(worker->ch, task->dst, task->src, + g_xfer_size_bytes, accel_done, task); break; case ACCEL_DUALCAST: - rc = spdk_accel_submit_dualcast(__accel_task_from_ap_task(task), - worker->ch, task->dst, task->dst2, - task->src, g_xfer_size_bytes, accel_done); + rc = spdk_accel_submit_dualcast(worker->ch, task->dst, task->dst2, + task->src, g_xfer_size_bytes, accel_done, task); break; default: assert(false); @@ -249,7 +242,7 @@ _submit_single(void *arg1, void *arg2) } if (rc) { - accel_done(__accel_task_from_ap_task(task), rc); + accel_done(task, rc); } } @@ -289,6 +282,14 @@ _accel_done(void *arg1) worker->xfer_failed++; } break; + case ACCEL_FILL: + if (memcmp(task->dst, task->src, g_xfer_size_bytes)) { + SPDK_NOTICELOG("Data miscompare\n"); + worker->xfer_failed++; + } + break; + case ACCEL_COMPARE: + break; default: assert(false); break; @@ -314,10 +315,20 @@ _accel_done(void *arg1) if (g_workload_selection == ACCEL_DUALCAST) { spdk_free(task->dst2); } - spdk_mempool_put(worker->task_pool, task); + TAILQ_INSERT_TAIL(&worker->tasks, task, link); } } +static void +batch_done(void *cb_arg, int status) +{ + struct ap_task *task = (struct ap_task *)cb_arg; + struct worker_thread *worker = task->worker; + + worker->current_queue_depth--; + TAILQ_INSERT_TAIL(&worker->tasks, task, link); +} + static int dump_result(void) { @@ -395,14 +406,98 @@ _init_thread_done(void *ctx) { } +static int +_get_task_data_bufs(struct ap_task *task) +{ + uint32_t align = 0; + + /* For dualcast, the DSA HW requires 4K alignment on destination addresses but + * we do this for all engines to keep it simple. 
+ */ + if (g_workload_selection == ACCEL_DUALCAST) { + align = ALIGN_4K; + } + + task->src = spdk_dma_zmalloc(g_xfer_size_bytes, 0, NULL); + if (task->src == NULL) { + fprintf(stderr, "Unable to alloc src buffer\n"); + return -ENOMEM; + } + memset(task->src, DATA_PATTERN, g_xfer_size_bytes); + + task->dst = spdk_dma_zmalloc(g_xfer_size_bytes, align, NULL); + if (task->dst == NULL) { + fprintf(stderr, "Unable to alloc dst buffer\n"); + return -ENOMEM; + } + + /* For compare we want the buffers to match, otherwise not. */ + if (g_workload_selection == ACCEL_COMPARE) { + memset(task->dst, DATA_PATTERN, g_xfer_size_bytes); + } else { + memset(task->dst, ~DATA_PATTERN, g_xfer_size_bytes); + } + + /* For fill, set the entire src buffer so we can check if verify is enabled. */ + if (g_workload_selection == ACCEL_FILL) { + memset(task->src, g_fill_pattern, g_xfer_size_bytes); + } + + if (g_workload_selection == ACCEL_DUALCAST) { + task->dst2 = spdk_dma_zmalloc(g_xfer_size_bytes, align, NULL); + if (task->dst2 == NULL) { + fprintf(stderr, "Unable to alloc dst buffer\n"); + return -ENOMEM; + } + memset(task->dst2, ~DATA_PATTERN, g_xfer_size_bytes); + } + + return 0; +} + +static int +_batch_prep_cmd(struct worker_thread *worker, struct ap_task *task, struct spdk_accel_batch *batch) +{ + int rc = 0; + + switch (g_workload_selection) { + case ACCEL_COPY: + rc = spdk_accel_batch_prep_copy(worker->ch, batch, task->dst, + task->src, g_xfer_size_bytes, accel_done, task); + break; + case ACCEL_DUALCAST: + rc = spdk_accel_batch_prep_dualcast(worker->ch, batch, task->dst, task->dst2, + task->src, g_xfer_size_bytes, accel_done, task); + break; + case ACCEL_COMPARE: + rc = spdk_accel_batch_prep_compare(worker->ch, batch, task->dst, task->src, + g_xfer_size_bytes, accel_done, task); + break; + case ACCEL_FILL: + rc = spdk_accel_batch_prep_fill(worker->ch, batch, task->dst, + *(uint8_t *)task->src, + g_xfer_size_bytes, accel_done, task); + break; + case ACCEL_CRC32C: + rc = 
spdk_accel_batch_prep_crc32c(worker->ch, batch, (uint32_t *)task->dst, + task->src, g_crc32c_seed, g_xfer_size_bytes, accel_done, task); + break; + default: + assert(false); + break; + } + + return rc; +} + static void _init_thread(void *arg1) { struct worker_thread *worker; - char task_pool_name[30]; struct ap_task *task; - int i; - uint32_t align = 0; + int i, rc, max_per_batch, batch_count, num_tasks; + int remaining = g_queue_depth; + struct spdk_accel_batch *batch, *new_batch; worker = calloc(1, sizeof(*worker)); if (worker == NULL) { @@ -410,28 +505,24 @@ _init_thread(void *arg1) return; } - /* For dualcast, the DSA HW requires 4K alignment on destination addresses but - * we do this for all engines to keep it simple. - */ - if (g_workload_selection == ACCEL_DUALCAST) { - align = ALIGN_4K; - } - worker->core = spdk_env_get_current_core(); worker->thread = spdk_get_thread(); worker->next = g_workers; worker->ch = spdk_accel_engine_get_io_channel(); - snprintf(task_pool_name, sizeof(task_pool_name), "task_pool_%d", g_num_workers); - worker->task_pool = spdk_mempool_create(task_pool_name, - g_queue_depth, - spdk_accel_task_size() + sizeof(struct ap_task), - SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, - SPDK_ENV_SOCKET_ID_ANY); - if (!worker->task_pool) { - fprintf(stderr, "Could not allocate buffer pool.\n"); - free(worker); - return; + max_per_batch = spdk_accel_batch_get_max(worker->ch); + assert(max_per_batch > 0); + num_tasks = g_queue_depth + spdk_divide_round_up(g_queue_depth, max_per_batch); + + TAILQ_INIT(&worker->tasks); + for (i = 0; i < num_tasks; i++) { + task = calloc(1, sizeof(struct ap_task)); + if (task == NULL) { + fprintf(stderr, "Could not allocate task.\n"); + return; + /* TODO cleanup */ + } + TAILQ_INSERT_TAIL(&worker->tasks, task, link); } /* Register a poller that will stop the worker at time elapsed */ @@ -443,50 +534,108 @@ _init_thread(void *arg1) g_num_workers++; pthread_mutex_unlock(&g_workers_lock); - for (i = 0; i < g_queue_depth; i++) { - 
task = spdk_mempool_get(worker->task_pool); - if (!task) { - fprintf(stderr, "Unable to get accel_task\n"); - return; - } + /* Batching is only possible if there is at least 2 operations. */ + if (g_queue_depth > 1) { - task->src = spdk_dma_zmalloc(g_xfer_size_bytes, 0, NULL); - if (task->src == NULL) { - fprintf(stderr, "Unable to alloc src buffer\n"); - return; - } - memset(task->src, DATA_PATTERN, g_xfer_size_bytes); + /* Outter loop sets up each batch command, inner loop populates the + * batch descriptors. + */ + do { + new_batch = spdk_accel_batch_create(worker->ch); + if (new_batch == NULL) { + break; + } - task->dst = spdk_dma_zmalloc(g_xfer_size_bytes, align, NULL); - if (task->dst == NULL) { - fprintf(stderr, "Unable to alloc dst buffer\n"); - return; - } + batch = new_batch; + batch_count = 0; + + do { + if (!TAILQ_EMPTY(&worker->tasks)) { + task = TAILQ_FIRST(&worker->tasks); + TAILQ_REMOVE(&worker->tasks, task, link); + } else { + fprintf(stderr, "Unable to get accel_task\n"); + goto error; + } + task->worker = worker; + task->worker->current_queue_depth++; + + if (_get_task_data_bufs(task)) { + fprintf(stderr, "Unable to get data bufs\n"); + goto error; + } + + rc = _batch_prep_cmd(worker, task, batch); + if (rc) { + fprintf(stderr, "error preping command\n"); + goto error; + } + remaining--; + batch_count++; + } while (batch_count < max_per_batch && remaining > 0); + + /* Now send the batch command. 
*/ + if (!TAILQ_EMPTY(&worker->tasks)) { + task = TAILQ_FIRST(&worker->tasks); + TAILQ_REMOVE(&worker->tasks, task, link); + } else { + fprintf(stderr, "Unable to get accel_task\n"); + goto error; + } + task->worker = worker; + task->worker->current_queue_depth++; - if (g_workload_selection == ACCEL_DUALCAST) { - task->dst2 = spdk_dma_zmalloc(g_xfer_size_bytes, align, NULL); - if (task->dst2 == NULL) { - fprintf(stderr, "Unable to alloc dst buffer\n"); - return; + rc = spdk_accel_batch_submit(worker->ch, batch, batch_done, task); + if (rc) { + fprintf(stderr, "error ending batch %d\n", rc); + goto error; } - memset(task->dst2, ~DATA_PATTERN, g_xfer_size_bytes); + /* We can't build a batch unless it has 2 descriptors (per spec). */ + } while (remaining > 1); + + /* If there are no more left, we're done. */ + if (remaining == 0) { + return; } + } - /* For compare we want the buffers to match, otherwise not. */ - if (g_workload_selection == ACCEL_COMPARE) { - memset(task->dst, DATA_PATTERN, g_xfer_size_bytes); + /* For engines that don't support batch or for the odd event that + * a batch ends with only one descriptor left. 
+ */ + for (i = 0; i < remaining; i++) { + + if (!TAILQ_EMPTY(&worker->tasks)) { + task = TAILQ_FIRST(&worker->tasks); + TAILQ_REMOVE(&worker->tasks, task, link); } else { - memset(task->dst, ~DATA_PATTERN, g_xfer_size_bytes); + fprintf(stderr, "Unable to get accel_task\n"); + goto error; + } + + if (_get_task_data_bufs(task)) { + fprintf(stderr, "Unable to get data bufs\n"); + goto error; } _submit_single(worker, task); } + return; +error: + /* TODO clean exit */ + raise(SIGINT); + while (!TAILQ_EMPTY(&worker->tasks)) { + task = TAILQ_FIRST(&worker->tasks); + TAILQ_REMOVE(&worker->tasks, task, link); + free(task); + } + free(worker); + spdk_app_stop(-1); } static void -accel_done(void *ref, int status) +accel_done(void *cb_arg, int status) { - struct ap_task *task = __ap_task_from_accel_task(ref); + struct ap_task *task = (struct ap_task *)cb_arg; struct worker_thread *worker = task->worker; assert(worker); @@ -498,18 +647,15 @@ accel_done(void *ref, int status) static void accel_perf_start(void *arg1) { - uint64_t capabilites; struct spdk_io_channel *accel_ch; accel_ch = spdk_accel_engine_get_io_channel(); - capabilites = spdk_accel_get_capabilities(accel_ch); + g_capabilites = spdk_accel_get_capabilities(accel_ch); spdk_put_io_channel(accel_ch); - if ((capabilites & g_workload_selection) != g_workload_selection) { - SPDK_ERRLOG("Selected workload is not supported by the current engine\n"); - SPDK_NOTICELOG("Software engine is selected by default, enable a HW engine via RPC\n\n"); - spdk_app_stop(-1); - return; + if ((g_capabilites & g_workload_selection) != g_workload_selection) { + SPDK_WARNLOG("The selected workload is not natively supported by the current engine\n"); + SPDK_WARNLOG("The software engine will be used instead.\n\n"); } g_tsc_rate = spdk_get_ticks_hz(); diff --git a/examples/bdev/fio_plugin/Makefile b/examples/bdev/fio_plugin/Makefile index d43221402f1..18c39d2f841 100644 --- a/examples/bdev/fio_plugin/Makefile +++ 
b/examples/bdev/fio_plugin/Makefile @@ -40,9 +40,8 @@ FIO_PLUGIN := spdk_bdev C_SRCS = fio_plugin.c -# Unable to combine the FIO plugin and the VPP socket abstraction (license incompatibility) -SPDK_LIB_LIST = $(filter-out sock_vpp,$(ALL_MODULES_LIST)) +SPDK_LIB_LIST = $(ALL_MODULES_LIST) SPDK_LIB_LIST += thread util bdev bdev_rpc conf accel rpc jsonrpc json log sock trace notify -SPDK_LIB_LIST += event event_bdev event_accel event_vmd +SPDK_LIB_LIST += event $(EVENT_BDEV_SUBSYSTEM) include $(SPDK_ROOT_DIR)/mk/spdk.fio.mk diff --git a/examples/bdev/fio_plugin/fio_plugin.c b/examples/bdev/fio_plugin/fio_plugin.c index 4f50cfb014d..66b3f7c7d36 100644 --- a/examples/bdev/fio_plugin/fio_plugin.c +++ b/examples/bdev/fio_plugin/fio_plugin.c @@ -87,6 +87,8 @@ struct spdk_fio_thread { struct io_u **iocq; /* io completion queue */ unsigned int iocq_count; /* number of iocq entries filled by last getevents */ unsigned int iocq_size; /* number of iocq entries allocated */ + + TAILQ_ENTRY(spdk_fio_thread) link; }; static bool g_spdk_env_initialized = false; @@ -96,6 +98,12 @@ static int spdk_fio_init(struct thread_data *td); static void spdk_fio_cleanup(struct thread_data *td); static size_t spdk_fio_poll_thread(struct spdk_fio_thread *fio_thread); +static pthread_t g_init_thread_id = 0; +static pthread_mutex_t g_init_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t g_init_cond; +static bool g_poll_loop = true; +static TAILQ_HEAD(, spdk_fio_thread) g_threads = TAILQ_HEAD_INITIALIZER(g_threads); + /* Default polling timeout (ns) */ #define SPDK_FIO_POLLING_TIMEOUT 1000000000ULL @@ -149,19 +157,9 @@ spdk_fio_cleanup_thread(struct spdk_fio_thread *fio_thread) { spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_close_targets, fio_thread); - while (!spdk_thread_is_idle(fio_thread->thread)) { - spdk_fio_poll_thread(fio_thread); - } - - spdk_set_thread(fio_thread->thread); - - spdk_thread_exit(fio_thread->thread); - while (!spdk_thread_is_exited(fio_thread->thread)) { - 
spdk_thread_poll(fio_thread->thread, 0, 0); - } - spdk_thread_destroy(fio_thread->thread); - free(fio_thread->iocq); - free(fio_thread); + pthread_mutex_lock(&g_init_mtx); + TAILQ_INSERT_TAIL(&g_threads, fio_thread, link); + pthread_mutex_unlock(&g_init_mtx); } static void @@ -189,11 +187,6 @@ spdk_fio_calc_timeout(struct spdk_fio_thread *fio_thread, struct timespec *ts) } } -static pthread_t g_init_thread_id = 0; -static pthread_mutex_t g_init_mtx = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t g_init_cond; -static bool g_poll_loop = true; - static void spdk_fio_bdev_init_done(int rc, void *cb_arg) { @@ -232,6 +225,7 @@ spdk_init_thread_poll(void *arg) { struct spdk_fio_options *eo = arg; struct spdk_fio_thread *fio_thread; + struct spdk_fio_thread *thread, *tmp; struct spdk_conf *config = NULL; struct spdk_env_opts opts; bool done; @@ -325,19 +319,37 @@ spdk_init_thread_poll(void *arg) pthread_mutex_lock(&g_init_mtx); pthread_cond_signal(&g_init_cond); + pthread_mutex_unlock(&g_init_mtx); + while (g_poll_loop) { spdk_fio_poll_thread(fio_thread); + pthread_mutex_lock(&g_init_mtx); + if (!TAILQ_EMPTY(&g_threads)) { + TAILQ_FOREACH_SAFE(thread, &g_threads, link, tmp) { + spdk_fio_poll_thread(thread); + } + + /* If there are exiting threads to poll, don't sleep. */ + pthread_mutex_unlock(&g_init_mtx); + continue; + } + + /* Figure out how long to sleep. 
*/ clock_gettime(CLOCK_MONOTONIC, &ts); spdk_fio_calc_timeout(fio_thread, &ts); rc = pthread_cond_timedwait(&g_init_cond, &g_init_mtx, &ts); + pthread_mutex_unlock(&g_init_mtx); + if (rc != ETIMEDOUT) { break; } + + } - pthread_mutex_unlock(&g_init_mtx); + spdk_fio_cleanup_thread(fio_thread); /* Finalize the bdev layer */ done = false; @@ -345,9 +357,32 @@ spdk_init_thread_poll(void *arg) do { spdk_fio_poll_thread(fio_thread); - } while (!done && !spdk_thread_is_idle(fio_thread->thread)); - spdk_fio_cleanup_thread(fio_thread); + TAILQ_FOREACH_SAFE(thread, &g_threads, link, tmp) { + spdk_fio_poll_thread(thread); + } + } while (!done); + + /* Now exit all the threads */ + TAILQ_FOREACH(thread, &g_threads, link) { + spdk_set_thread(thread->thread); + spdk_thread_exit(thread->thread); + spdk_set_thread(NULL); + } + + /* And wait for them to gracefully exit */ + while (!TAILQ_EMPTY(&g_threads)) { + TAILQ_FOREACH_SAFE(thread, &g_threads, link, tmp) { + if (spdk_thread_is_exited(thread->thread)) { + TAILQ_REMOVE(&g_threads, thread, link); + spdk_thread_destroy(thread->thread); + free(thread->iocq); + free(thread); + } else { + spdk_thread_poll(thread->thread, 0, 0); + } + } + } pthread_exit(NULL); diff --git a/examples/bdev/hello_world/Makefile b/examples/bdev/hello_world/Makefile index dc26f45ed98..f4a5a5b6937 100644 --- a/examples/bdev/hello_world/Makefile +++ b/examples/bdev/hello_world/Makefile @@ -38,7 +38,7 @@ APP = hello_bdev C_SRCS := hello_bdev.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += bdev_rpc bdev accel event thread util conf trace log jsonrpc json rpc sock notify include $(SPDK_ROOT_DIR)/mk/spdk.app.mk diff --git a/examples/blob/cli/Makefile b/examples/blob/cli/Makefile index 5f1ff09b52e..3c3ff1f26dc 100644 --- a/examples/blob/cli/Makefile +++ b/examples/blob/cli/Makefile @@ -39,7 +39,7 @@ C_SRCS := blobcli.c # Don't link bdev_lvol in blobcli - otherwise 
this utility cannot operate on an lvolstore SPDK_LIB_LIST = $(filter-out bdev_lvol,$(ALL_MODULES_LIST)) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += bdev accel event thread util conf trace \ log jsonrpc json rpc sock notify diff --git a/examples/blob/hello_world/Makefile b/examples/blob/hello_world/Makefile index b0534420b0a..0b5e8939648 100644 --- a/examples/blob/hello_world/Makefile +++ b/examples/blob/hello_world/Makefile @@ -38,7 +38,7 @@ APP = hello_blob C_SRCS := hello_blob.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += bdev accel event thread util conf trace \ log jsonrpc json rpc sock notify diff --git a/examples/blob/hello_world/hello_blob.c b/examples/blob/hello_world/hello_blob.c index b62e446b46d..41730ce8696 100644 --- a/examples/blob/hello_world/hello_blob.c +++ b/examples/blob/hello_world/hello_blob.c @@ -459,7 +459,7 @@ main(int argc, char **argv) * specify a name for the app. 
*/ opts.name = "hello_blob"; - opts.config_file = argv[1]; + opts.json_config_file = argv[1]; /* diff --git a/examples/blob/hello_world/hello_blob.conf b/examples/blob/hello_world/hello_blob.conf deleted file mode 100644 index 3fa7e9d9afa..00000000000 --- a/examples/blob/hello_world/hello_blob.conf +++ /dev/null @@ -1,3 +0,0 @@ -[Malloc] - NumberOfLuns 1 - LunSizeInMB 16 diff --git a/examples/blob/hello_world/hello_blob.json b/examples/blob/hello_world/hello_blob.json new file mode 100644 index 00000000000..10ded9d474c --- /dev/null +++ b/examples/blob/hello_world/hello_blob.json @@ -0,0 +1,17 @@ +{ + "subsystems": [ + { + "subsystem": "bdev", + "config": [ + { + "method": "bdev_malloc_create", + "params": { + "name": "Malloc0", + "num_blocks": 32768, + "block_size": 512 + } + } + ] + } + ] +} diff --git a/examples/ioat/verify/verify.c b/examples/ioat/verify/verify.c index 0df41f69be1..441e32bfd75 100644 --- a/examples/ioat/verify/verify.c +++ b/examples/ioat/verify/verify.c @@ -307,7 +307,9 @@ submit_xfers(struct thread_entry *thread_entry, uint64_t queue_depth) while (queue_depth-- > 0) { struct ioat_task *ioat_task = NULL; ioat_task = spdk_mempool_get(thread_entry->task_pool); + assert(ioat_task != NULL); ioat_task->buffer = spdk_mempool_get(thread_entry->data_pool); + assert(ioat_task->buffer != NULL); ioat_task->type = IOAT_COPY_TYPE; if (spdk_ioat_get_dma_capabilities(thread_entry->chan) & SPDK_IOAT_ENGINE_FILL_SUPPORTED) { diff --git a/examples/nvme/Makefile b/examples/nvme/Makefile index b69efc97565..14eeb9be7ef 100644 --- a/examples/nvme/Makefile +++ b/examples/nvme/Makefile @@ -35,7 +35,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk DIRS-y += hello_world identify perf reconnect nvme_manage arbitration \ - hotplug cmb_copy + hotplug cmb_copy abort DIRS-$(CONFIG_FIO_PLUGIN) += fio_plugin diff --git a/examples/nvme/abort/.gitignore b/examples/nvme/abort/.gitignore new file mode 100644 index 00000000000..f7d13fd0466 --- /dev/null +++ b/examples/nvme/abort/.gitignore @@ -0,0 +1 @@ +abort diff --git a/examples/nvme/abort/Makefile b/examples/nvme/abort/Makefile new file mode 100644 index 00000000000..5073a842d1f --- /dev/null +++ b/examples/nvme/abort/Makefile @@ -0,0 +1,38 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +APP = abort + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk diff --git a/examples/nvme/abort/abort.c b/examples/nvme/abort/abort.c new file mode 100644 index 00000000000..390453e4681 --- /dev/null +++ b/examples/nvme/abort/abort.c @@ -0,0 +1,1147 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/nvme.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/likely.h" + +struct ctrlr_entry { + struct spdk_nvme_ctrlr *ctrlr; + enum spdk_nvme_transport_type trtype; + + struct ctrlr_entry *next; + char name[1024]; +}; + +struct ns_entry { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ns *ns; + + struct ns_entry *next; + uint32_t io_size_blocks; + uint32_t num_io_requests; + uint64_t size_in_ios; + uint32_t block_size; + char name[1024]; +}; + +struct ctrlr_worker_ctx { + pthread_mutex_t mutex; + struct ctrlr_entry *entry; + uint64_t abort_submitted; + uint64_t abort_submit_failed; + uint64_t successful_abort; + uint64_t unsuccessful_abort; + uint64_t abort_failed; + uint64_t current_queue_depth; + struct spdk_nvme_ctrlr *ctrlr; + struct ctrlr_worker_ctx *next; +}; + +struct ns_worker_ctx { + struct ns_entry *entry; + uint64_t io_submitted; + uint64_t io_completed; + uint64_t io_aborted; + uint64_t io_failed; + uint64_t current_queue_depth; + uint64_t offset_in_ios; + bool is_draining; + struct spdk_nvme_qpair *qpair; + struct ctrlr_worker_ctx *ctrlr_ctx; + struct ns_worker_ctx *next; +}; + +struct perf_task { + struct ns_worker_ctx *ns_ctx; + void *buf; +}; + +struct worker_thread { + struct ns_worker_ctx *ns_ctx; + struct ctrlr_worker_ctx *ctrlr_ctx; + struct worker_thread *next; + unsigned 
lcore; +}; + +static const char *g_workload_type = "read"; +static struct ctrlr_entry *g_controllers; +static struct ns_entry *g_namespaces; +static int g_num_namespaces; +static struct worker_thread *g_workers; +static int g_num_workers; +static uint32_t g_master_core; + +static int g_abort_interval = 1; + +static uint64_t g_tsc_rate; + +static uint32_t g_io_size_bytes = 131072; +static uint32_t g_max_io_size_blocks; +static int g_rw_percentage = -1; +static int g_is_random; +static int g_queue_depth = 128; +static int g_time_in_sec = 3; +static int g_dpdk_mem; +static int g_shm_id = -1; +static bool g_no_pci; +static bool g_warn; +static bool g_mix_specified; + +static const char *g_core_mask; + +struct trid_entry { + struct spdk_nvme_transport_id trid; + uint16_t nsid; + TAILQ_ENTRY(trid_entry) tailq; +}; + +static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list); + +static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); + +static int +build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport_id *trid; + int res = 0; + + trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); + + switch (trid->trtype) { + case SPDK_NVME_TRANSPORT_PCIE: + res = snprintf(name, length, "PCIE (%s)", trid->traddr); + break; + case SPDK_NVME_TRANSPORT_RDMA: + res = snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + case SPDK_NVME_TRANSPORT_TCP: + res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + case SPDK_NVME_TRANSPORT_CUSTOM: + res = snprintf(name, length, "CUSTOM (%s)", trid->traddr); + break; + + default: + fprintf(stderr, "Unknown transport type %d\n", trid->trtype); + break; + } + return res; +} + +static void +build_nvme_ns_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + int res = 0; + + res = build_nvme_name(name, length, ctrlr); + if (res > 0) { + snprintf(name + res, 
length - res, " NSID %u", nsid); + } + +} + +static void +register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) +{ + struct ns_entry *entry; + const struct spdk_nvme_ctrlr_data *cdata; + uint32_t max_xfer_size, entries, sector_size; + uint64_t ns_size; + struct spdk_nvme_io_qpair_opts opts; + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if (!spdk_nvme_ns_is_active(ns)) { + printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", + cdata->mn, cdata->sn, + spdk_nvme_ns_get_id(ns)); + g_warn = true; + return; + } + + ns_size = spdk_nvme_ns_get_size(ns); + sector_size = spdk_nvme_ns_get_sector_size(ns); + + if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) { + printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " + "ns size %" PRIu64 " / block size %u for I/O size %u\n", + cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), + ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); + g_warn = true; + return; + } + + max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + /* NVMe driver may add additional entries based on + * stripe size and maximum transfer size, we assume + * 1 more entry be used for stripe. + */ + entries = (g_io_size_bytes - 1) / max_xfer_size + 2; + if ((g_queue_depth * entries) > opts.io_queue_size) { + printf("controller IO queue size %u less than required\n", + opts.io_queue_size); + printf("Consider using lower queue depth or small IO size because " + "IO requests may be queued at the NVMe driver.\n"); + } + /* For requests which have children requests, parent request itself + * will also occupy 1 entry. 
+ */ + entries += 1; + + entry = calloc(1, sizeof(struct ns_entry)); + if (entry == NULL) { + perror("ns_entry malloc"); + exit(1); + } + + entry->ctrlr = ctrlr; + entry->ns = ns; + entry->num_io_requests = g_queue_depth * entries; + + entry->size_in_ios = ns_size / g_io_size_bytes; + entry->io_size_blocks = g_io_size_bytes / sector_size; + + entry->block_size = spdk_nvme_ns_get_sector_size(ns); + + if (g_max_io_size_blocks < entry->io_size_blocks) { + g_max_io_size_blocks = entry->io_size_blocks; + } + + build_nvme_ns_name(entry->name, sizeof(entry->name), ctrlr, spdk_nvme_ns_get_id(ns)); + + g_num_namespaces++; + entry->next = g_namespaces; + g_namespaces = entry; +} + +static void +unregister_namespaces(void) +{ + struct ns_entry *entry = g_namespaces; + + while (entry) { + struct ns_entry *next = entry->next; + free(entry); + entry = next; + } +} + +static void +register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry) +{ + struct spdk_nvme_ns *ns; + struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry)); + uint32_t nsid; + + if (entry == NULL) { + perror("ctrlr_entry malloc"); + exit(1); + } + + build_nvme_name(entry->name, sizeof(entry->name), ctrlr); + + entry->ctrlr = ctrlr; + entry->trtype = trid_entry->trid.trtype; + entry->next = g_controllers; + g_controllers = entry; + + if (trid_entry->nsid == 0) { + for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + continue; + } + register_ns(ctrlr, ns); + } + } else { + ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid); + if (!ns) { + perror("Namespace does not exist."); + exit(1); + } + + register_ns(ctrlr, ns); + } +} + +static void +abort_complete(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct ctrlr_worker_ctx *ctrlr_ctx = ctx; + + ctrlr_ctx->current_queue_depth--; + if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { + 
ctrlr_ctx->abort_failed++; + } else if ((cpl->cdw0 & 0x1) == 0) { + ctrlr_ctx->successful_abort++; + } else { + ctrlr_ctx->unsuccessful_abort++; + } +} + +static void +abort_task(struct perf_task *task) +{ + struct ns_worker_ctx *ns_ctx = task->ns_ctx; + struct ctrlr_worker_ctx *ctrlr_ctx = ns_ctx->ctrlr_ctx; + int rc; + + /* Hold mutex to guard ctrlr_ctx->current_queue_depth. */ + pthread_mutex_lock(&ctrlr_ctx->mutex); + + rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ctx->ctrlr, ns_ctx->qpair, task, abort_complete, + ctrlr_ctx); + + if (spdk_unlikely(rc != 0)) { + ctrlr_ctx->abort_submit_failed++; + } else { + ctrlr_ctx->current_queue_depth++; + ctrlr_ctx->abort_submitted++; + } + + pthread_mutex_unlock(&ctrlr_ctx->mutex); +} + +static __thread unsigned int seed = 0; + +static inline void +submit_single_io(struct perf_task *task) +{ + uint64_t offset_in_ios, lba; + int rc; + struct ns_worker_ctx *ns_ctx = task->ns_ctx; + struct ns_entry *entry = ns_ctx->entry; + + if (g_is_random) { + offset_in_ios = rand_r(&seed) % entry->size_in_ios; + } else { + offset_in_ios = ns_ctx->offset_in_ios++; + if (ns_ctx->offset_in_ios == entry->size_in_ios) { + ns_ctx->offset_in_ios = 0; + } + } + + lba = offset_in_ios * entry->io_size_blocks; + + if ((g_rw_percentage == 100) || + (g_rw_percentage != 0 && (rand_r(&seed) % 100) < g_rw_percentage)) { + rc = spdk_nvme_ns_cmd_read(entry->ns, ns_ctx->qpair, task->buf, + lba, entry->io_size_blocks, io_complete, task, 0); + } else { + rc = spdk_nvme_ns_cmd_write(entry->ns, ns_ctx->qpair, task->buf, + lba, entry->io_size_blocks, io_complete, task, 0); + } + + if (spdk_unlikely(rc != 0)) { + fprintf(stderr, "I/O submission failed\n"); + } else { + ns_ctx->current_queue_depth++; + ns_ctx->io_submitted++; + + if ((ns_ctx->io_submitted % g_abort_interval) == 0) { + abort_task(task); + } + } + +} + +static void +io_complete(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct perf_task *task = ctx; + struct ns_worker_ctx *ns_ctx = task->ns_ctx; + 
+ ns_ctx->current_queue_depth--; + if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { + ns_ctx->io_failed++; + } else { + ns_ctx->io_completed++; + } + + /* is_draining indicates when time has expired for the test run and we are + * just waiting for the previously submitted I/O to complete. In this case, + * do not submit a new I/O to replace the one just completed. + */ + if (spdk_unlikely(ns_ctx->is_draining)) { + spdk_dma_free(task->buf); + free(task); + } else { + submit_single_io(task); + } +} + +static struct perf_task * +allocate_task(struct ns_worker_ctx *ns_ctx) +{ + struct perf_task *task; + + task = calloc(1, sizeof(*task)); + if (task == NULL) { + fprintf(stderr, "Failed to allocate task\n"); + exit(1); + } + + task->buf = spdk_dma_zmalloc(g_io_size_bytes, 0x200, NULL); + if (task->buf == NULL) { + free(task); + fprintf(stderr, "Failed to allocate task->buf\n"); + exit(1); + } + + task->ns_ctx = ns_ctx; + + return task; +} + +static void +submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth) +{ + struct perf_task *task; + + while (queue_depth-- > 0) { + task = allocate_task(ns_ctx); + submit_single_io(task); + } +} + +static int +work_fn(void *arg) +{ + struct worker_thread *worker = (struct worker_thread *)arg; + struct ns_worker_ctx *ns_ctx; + struct ctrlr_worker_ctx *ctrlr_ctx; + struct ns_entry *ns_entry; + struct spdk_nvme_io_qpair_opts opts; + uint64_t tsc_end; + uint32_t unfinished_ctx; + + /* Allocate queue pair for each namespace. 
*/ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + ns_entry = ns_ctx->entry; + + spdk_nvme_ctrlr_get_default_io_qpair_opts(ns_entry->ctrlr, &opts, sizeof(opts)); + if (opts.io_queue_requests < ns_entry->num_io_requests) { + opts.io_queue_requests = ns_entry->num_io_requests; + } + + ns_ctx->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ns_entry->ctrlr, &opts, sizeof(opts)); + if (ns_ctx->qpair == NULL) { + fprintf(stderr, "spdk_nvme_ctrlr_alloc_io_qpair failed\n"); + return 1; + } + + ns_ctx = ns_ctx->next; + } + + tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate; + + /* Submit initial I/O for each namespace. */ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + submit_io(ns_ctx, g_queue_depth); + ns_ctx = ns_ctx->next; + } + + while (1) { + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + spdk_nvme_qpair_process_completions(ns_ctx->qpair, 0); + ns_ctx = ns_ctx->next; + } + + if (worker->lcore == g_master_core) { + ctrlr_ctx = worker->ctrlr_ctx; + while (ctrlr_ctx) { + /* Hold mutex to guard ctrlr_ctx->current_queue_depth. 
*/ + pthread_mutex_lock(&ctrlr_ctx->mutex); + spdk_nvme_ctrlr_process_admin_completions(ctrlr_ctx->ctrlr); + pthread_mutex_unlock(&ctrlr_ctx->mutex); + ctrlr_ctx = ctrlr_ctx->next; + } + } + + if (spdk_get_ticks() > tsc_end) { + break; + } + } + + do { + unfinished_ctx = 0; + + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + if (!ns_ctx->is_draining) { + ns_ctx->is_draining = true; + } + if (ns_ctx->current_queue_depth > 0) { + spdk_nvme_qpair_process_completions(ns_ctx->qpair, 0); + if (ns_ctx->current_queue_depth == 0) { + spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair); + } else { + unfinished_ctx++; + } + } + ns_ctx = ns_ctx->next; + } + } while (unfinished_ctx > 0); + + if (worker->lcore == g_master_core) { + do { + unfinished_ctx = 0; + + ctrlr_ctx = worker->ctrlr_ctx; + while (ctrlr_ctx != NULL) { + pthread_mutex_lock(&ctrlr_ctx->mutex); + if (ctrlr_ctx->current_queue_depth > 0) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr_ctx->ctrlr); + if (ctrlr_ctx->current_queue_depth > 0) { + unfinished_ctx++; + } + } + pthread_mutex_unlock(&ctrlr_ctx->mutex); + ctrlr_ctx = ctrlr_ctx->next; + } + } while (unfinished_ctx > 0); + } + + return 0; +} + +static void +usage(char *program_name) +{ + printf("%s options", program_name); + + printf("\n"); + printf("\t[-q io depth]\n"); + printf("\t[-o io size in bytes]\n"); + printf("\t[-w io pattern type, must be one of\n"); + printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n"); + printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n"); + printf("\t[-t time in seconds]\n"); + printf("\t[-c core mask for I/O submission/completion.]\n"); + printf("\t\t(default: 1)\n"); + printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n"); + printf("\t Format: 'key:value [key:value] ...'\n"); + printf("\t Keys:\n"); + printf("\t trtype Transport type (e.g. PCIe, RDMA)\n"); + printf("\t adrfam Address family (e.g. IPv4, IPv6)\n"); + printf("\t traddr Transport address (e.g. 
0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n"); + printf("\t trsvcid Transport service identifier (e.g. 4420)\n"); + printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN); + printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n"); + printf("\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n"); + printf("\t[-s DPDK huge memory size in MB.]\n"); + printf("\t[-i shared memory group ID]\n"); + printf("\t[-a abort interval.]\n"); + printf("\t"); + spdk_log_usage(stdout, "-T"); +#ifdef DEBUG + printf("\t[-G enable debug logging]\n"); +#else + printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)\n"); +#endif +} + +static void +unregister_trids(void) +{ + struct trid_entry *trid_entry, *tmp; + + TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) { + TAILQ_REMOVE(&g_trid_list, trid_entry, tailq); + free(trid_entry); + } +} + +static int +add_trid(const char *trid_str) +{ + struct trid_entry *trid_entry; + struct spdk_nvme_transport_id *trid; + char *ns; + + trid_entry = calloc(1, sizeof(*trid_entry)); + if (trid_entry == NULL) { + return -1; + } + + trid = &trid_entry->trid; + trid->trtype = SPDK_NVME_TRANSPORT_PCIE; + snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); + + if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) { + fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str); + free(trid_entry); + return 1; + } + + spdk_nvme_transport_id_populate_trstring(trid, + spdk_nvme_transport_id_trtype_str(trid->trtype)); + + ns = strcasestr(trid_str, "ns:"); + if (ns) { + char nsid_str[6]; /* 5 digits maximum in an nsid */ + int len; + int nsid; + + ns += 3; + + len = strcspn(ns, " \t\n"); + if (len > 5) { + fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n"); + free(trid_entry); + return 1; + } + + memcpy(nsid_str, ns, len); + nsid_str[len] = '\0'; + + nsid = spdk_strtol(nsid_str, 10); + if (nsid <= 0 || nsid > 
65535) { + fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n"); + free(trid_entry); + return 1; + } + + trid_entry->nsid = (uint16_t)nsid; + } + + TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq); + return 0; +} + +static int +parse_args(int argc, char **argv) +{ + int op; + long int val; + int rc; + + while ((op = getopt(argc, argv, "a:c:i:o:q:r:s:t:w:M:")) != -1) { + switch (op) { + case 'a': + case 'i': + case 'o': + case 'q': + case 's': + case 't': + case 'M': + val = spdk_strtol(optarg, 10); + if (val < 0) { + fprintf(stderr, "Converting a string to integer failed\n"); + return val; + } + switch (op) { + case 'a': + g_abort_interval = val; + break; + case 'i': + g_shm_id = val; + break; + case 'o': + g_io_size_bytes = val; + break; + case 'q': + g_queue_depth = val; + break; + case 's': + g_dpdk_mem = val; + break; + case 't': + g_time_in_sec = val; + break; + case 'M': + g_rw_percentage = val; + g_mix_specified = true; + break; + } + break; + case 'c': + g_core_mask = optarg; + break; + case 'r': + if (add_trid(optarg)) { + usage(argv[0]); + return 1; + } + break; + case 'w': + g_workload_type = optarg; + break; + case 'G': +#ifndef DEBUG + fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n", + argv[0]); + usage(argv[0]); + return 1; +#else + spdk_log_set_flag("nvme"); + spdk_log_set_print_level(SPDK_LOG_DEBUG); + break; +#endif + case 'T': + rc = spdk_log_set_flag(optarg); + if (rc < 0) { + fprintf(stderr, "unknown flag\n"); + usage(argv[0]); + exit(EXIT_FAILURE); + } + spdk_log_set_print_level(SPDK_LOG_DEBUG); +#ifndef DEBUG + fprintf(stderr, "%s must be rebuilt with CONFIG_DEBUG=y for -T flag.\n", + argv[0]); + usage(argv[0]); + return 0; +#endif + break; + default: + usage(argv[0]); + return 1; + } + } + + if (!g_queue_depth) { + fprintf(stderr, "missing -q (queue size) operand\n"); + usage(argv[0]); + return 1; + } + if (!g_io_size_bytes) { + fprintf(stderr, "missing -o (block size) operand\n"); + 
usage(argv[0]); + return 1; + } + if (!g_workload_type) { + fprintf(stderr, "missing -t (test time in seconds) operand\n"); + usage(argv[0]); + return 1; + } + + if (!g_time_in_sec) { + usage(argv[0]); + return 1; + } + + if (strncmp(g_workload_type, "rand", 4) == 0) { + g_is_random = 1; + g_workload_type = &g_workload_type[4]; + } + + if (strcmp(g_workload_type, "read") == 0 || strcmp(g_workload_type, "write") == 0) { + g_rw_percentage = strcmp(g_workload_type, "read") == 0 ? 100 : 0; + if (g_mix_specified) { + fprintf(stderr, "Ignoring -M option... Please use -M option" + " only when using rw or randrw.\n"); + } + } else if (strcmp(g_workload_type, "rw") == 0) { + if (g_rw_percentage < 0 || g_rw_percentage > 100) { + fprintf(stderr, + "-M must be specified to value from 0 to 100 " + "for rw or randrw.\n"); + return 1; + } + } else { + fprintf(stderr, + "io pattern type must be one of\n" + "(read, write, randread, randwrite, rw, randrw)\n"); + return 1; + } + + if (TAILQ_EMPTY(&g_trid_list)) { + /* If no transport IDs specified, default to enumerating all local PCIe devices */ + add_trid("trtype:PCIe"); + } else { + struct trid_entry *trid_entry, *trid_entry_tmp; + + g_no_pci = true; + /* check whether there is local PCIe type */ + TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) { + if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + g_no_pci = false; + break; + } + } + } + + return 0; +} + +static int +register_workers(void) +{ + uint32_t i; + struct worker_thread *worker; + + g_workers = NULL; + g_num_workers = 0; + + SPDK_ENV_FOREACH_CORE(i) { + worker = calloc(1, sizeof(*worker)); + if (worker == NULL) { + fprintf(stderr, "Unable to allocate worker\n"); + return -1; + } + + worker->lcore = i; + worker->next = g_workers; + g_workers = worker; + g_num_workers++; + } + + return 0; +} + +static void +unregister_workers(void) +{ + struct worker_thread *worker = g_workers; + + /* Free namespace context and worker thread */ + while 
(worker) { + struct worker_thread *next_worker = worker->next; + struct ns_worker_ctx *ns_ctx = worker->ns_ctx; + + while (ns_ctx) { + struct ns_worker_ctx *next_ns_ctx = ns_ctx->next; + + printf("NS: %s I/O completed: %lu, failed: %lu\n", + ns_ctx->entry->name, ns_ctx->io_completed, ns_ctx->io_failed); + free(ns_ctx); + ns_ctx = next_ns_ctx; + } + + struct ctrlr_worker_ctx *ctrlr_ctx = worker->ctrlr_ctx; + + while (ctrlr_ctx) { + struct ctrlr_worker_ctx *next_ctrlr_ctx = ctrlr_ctx->next; + + printf("CTRLR: %s abort submitted %lu, failed to submit %lu\n", + ctrlr_ctx->entry->name, ctrlr_ctx->abort_submitted, + ctrlr_ctx->abort_submit_failed); + printf("\t success %lu, unsuccess %lu, failed %lu\n", + ctrlr_ctx->successful_abort, ctrlr_ctx->unsuccessful_abort, + ctrlr_ctx->abort_failed); + free(ctrlr_ctx); + ctrlr_ctx = next_ctrlr_ctx; + } + + free(worker); + worker = next_worker; + } +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + return true; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct trid_entry *trid_entry = cb_ctx; + struct spdk_pci_addr pci_addr; + struct spdk_pci_device *pci_dev; + struct spdk_pci_id pci_id; + + if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) { + printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n", + trid->traddr, trid->trsvcid, + trid->subnqn); + } else { + if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) { + return; + } + + pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr); + if (!pci_dev) { + return; + } + + pci_id = spdk_pci_device_get_id(pci_dev); + + printf("Attached to NVMe Controller at %s [%04x:%04x]\n", + trid->traddr, + pci_id.vendor_id, pci_id.device_id); + } + + register_ctrlr(ctrlr, trid_entry); +} + +static int +register_controllers(void) +{ + struct trid_entry *trid_entry; + + printf("Initializing NVMe 
Controllers\n"); + + TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) { + if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n", + trid_entry->trid.traddr); + return -1; + } + } + + return 0; +} + +static void +unregister_controllers(void) +{ + struct ctrlr_entry *entry = g_controllers; + + while (entry) { + struct ctrlr_entry *next = entry->next; + spdk_nvme_detach(entry->ctrlr); + free(entry); + entry = next; + } +} + +static int +associate_master_worker_with_ctrlr(void) +{ + struct ctrlr_entry *entry = g_controllers; + struct worker_thread *worker = g_workers; + struct ctrlr_worker_ctx *ctrlr_ctx; + + while (worker) { + if (worker->lcore == g_master_core) { + break; + } + worker = worker->next; + } + + if (!worker) { + return -1; + } + + while (entry) { + ctrlr_ctx = calloc(1, sizeof(struct ctrlr_worker_ctx)); + if (!ctrlr_ctx) { + return -1; + } + + pthread_mutex_init(&ctrlr_ctx->mutex, NULL); + ctrlr_ctx->entry = entry; + ctrlr_ctx->ctrlr = entry->ctrlr; + ctrlr_ctx->next = worker->ctrlr_ctx; + worker->ctrlr_ctx = ctrlr_ctx; + + entry = entry->next; + } + + return 0; +} + +static struct ctrlr_worker_ctx * +get_ctrlr_worker_ctx(struct spdk_nvme_ctrlr *ctrlr) +{ + struct worker_thread *worker = g_workers; + struct ctrlr_worker_ctx *ctrlr_ctx; + + while (worker != NULL) { + if (worker->lcore == g_master_core) { + break; + } + worker = worker->next; + } + + if (!worker) { + return NULL; + } + + ctrlr_ctx = worker->ctrlr_ctx; + + while (ctrlr_ctx != NULL) { + if (ctrlr_ctx->ctrlr == ctrlr) { + return ctrlr_ctx; + } + ctrlr_ctx = ctrlr_ctx->next; + } + + return NULL; +} + +static int +associate_workers_with_ns(void) +{ + struct ns_entry *entry = g_namespaces; + struct worker_thread *worker = g_workers; + struct ns_worker_ctx *ns_ctx; + int i, count; + + count = g_num_namespaces > g_num_workers ? 
g_num_namespaces : g_num_workers; + + for (i = 0; i < count; i++) { + if (entry == NULL) { + break; + } + + ns_ctx = calloc(1, sizeof(struct ns_worker_ctx)); + if (!ns_ctx) { + return -1; + } + + printf("Associating %s with lcore %d\n", entry->name, worker->lcore); + ns_ctx->entry = entry; + ns_ctx->ctrlr_ctx = get_ctrlr_worker_ctx(entry->ctrlr); + if (!ns_ctx->ctrlr_ctx) { + free(ns_ctx); + return -1; + } + + ns_ctx->next = worker->ns_ctx; + worker->ns_ctx = ns_ctx; + + worker = worker->next; + if (worker == NULL) { + worker = g_workers; + } + + entry = entry->next; + if (entry == NULL) { + entry = g_namespaces; + } + } + + return 0; +} + +int main(int argc, char **argv) +{ + int rc; + struct worker_thread *worker, *master_worker; + struct spdk_env_opts opts; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + spdk_env_opts_init(&opts); + opts.name = "abort"; + opts.shm_id = g_shm_id; + if (g_core_mask) { + opts.core_mask = g_core_mask; + } + + if (g_dpdk_mem) { + opts.mem_size = g_dpdk_mem; + } + if (g_no_pci) { + opts.no_pci = g_no_pci; + } + if (spdk_env_init(&opts) < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + rc = -1; + goto cleanup; + } + + g_tsc_rate = spdk_get_ticks_hz(); + + if (register_workers() != 0) { + rc = -1; + goto cleanup; + } + + if (register_controllers() != 0) { + rc = -1; + goto cleanup; + } + + if (g_warn) { + printf("WARNING: Some requested NVMe devices were skipped\n"); + } + + if (g_num_namespaces == 0) { + fprintf(stderr, "No valid NVMe controllers found\n"); + goto cleanup; + } + + if (associate_master_worker_with_ctrlr() != 0) { + rc = -1; + goto cleanup; + } + + if (associate_workers_with_ns() != 0) { + rc = -1; + goto cleanup; + } + + printf("Initialization complete. 
Launching workers.\n"); + + /* Launch all of the slave workers */ + g_master_core = spdk_env_get_current_core(); + master_worker = NULL; + worker = g_workers; + while (worker != NULL) { + if (worker->lcore != g_master_core) { + spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); + } else { + assert(master_worker == NULL); + master_worker = worker; + } + worker = worker->next; + } + + assert(master_worker != NULL); + rc = work_fn(master_worker); + + spdk_env_thread_wait_all(); + +cleanup: + unregister_trids(); + unregister_workers(); + unregister_namespaces(); + unregister_controllers(); + + if (rc != 0) { + fprintf(stderr, "%s: errors occured\n", argv[0]); + } + + return rc; +} diff --git a/examples/nvme/arbitration/arbitration.c b/examples/nvme/arbitration/arbitration.c index 32a99401fe0..4440760418a 100644 --- a/examples/nvme/arbitration/arbitration.c +++ b/examples/nvme/arbitration/arbitration.c @@ -158,19 +158,13 @@ register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) cdata = spdk_nvme_ctrlr_get_data(ctrlr); - if (!spdk_nvme_ns_is_active(ns)) { - printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", - cdata->mn, cdata->sn, - spdk_nvme_ns_get_id(ns)); - return; - } - if (spdk_nvme_ns_get_size(ns) < g_arbitration.io_size_bytes || - spdk_nvme_ns_get_sector_size(ns) > g_arbitration.io_size_bytes) { + spdk_nvme_ns_get_extended_sector_size(ns) > g_arbitration.io_size_bytes || + g_arbitration.io_size_bytes % spdk_nvme_ns_get_extended_sector_size(ns)) { printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " "ns size %" PRIu64 " / block size %u for I/O size %u\n", cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), - spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns), + spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_extended_sector_size(ns), g_arbitration.io_size_bytes); return; } @@ -231,7 +225,7 @@ set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable) static void register_ctrlr(struct 
spdk_nvme_ctrlr *ctrlr) { - int nsid, num_ns; + uint32_t nsid; struct spdk_nvme_ns *ns; struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry)); union spdk_nvme_cap_register cap = spdk_nvme_ctrlr_get_regs_cap(ctrlr); @@ -253,8 +247,8 @@ register_ctrlr(struct spdk_nvme_ctrlr *ctrlr) set_latency_tracking_feature(ctrlr, true); } - num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); - for (nsid = 1; nsid <= num_ns; nsid++) { + for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); nsid != 0; + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); if (ns == NULL) { continue; diff --git a/examples/nvme/fio_plugin/Makefile b/examples/nvme/fio_plugin/Makefile index 5ffdac80106..75caa3d789c 100644 --- a/examples/nvme/fio_plugin/Makefile +++ b/examples/nvme/fio_plugin/Makefile @@ -40,9 +40,8 @@ FIO_PLUGIN := spdk_nvme C_SRCS = fio_plugin.c -# Unable to combine the FIO plugin and the VPP socket abstraction (license incompatibility) -SPDK_LIB_LIST = $(filter-out sock_vpp,$(SOCK_MODULES_LIST)) -SPDK_LIB_LIST += nvme thread util log sock vmd +SPDK_LIB_LIST = $(SOCK_MODULES_LIST) +SPDK_LIB_LIST += nvme thread util log sock vmd jsonrpc json rpc ifeq ($(CONFIG_RDMA),y) SPDK_LIB_LIST += rdma diff --git a/examples/nvme/fio_plugin/fio_plugin.c b/examples/nvme/fio_plugin/fio_plugin.c index b3c2fd26499..7aabeb8cbd2 100644 --- a/examples/nvme/fio_plugin/fio_plugin.c +++ b/examples/nvme/fio_plugin/fio_plugin.c @@ -66,6 +66,12 @@ static uint16_t g_spdk_apptag_mask; struct spdk_fio_options { void *pad; /* off1 used in option descriptions may not be 0 */ + int enable_wrr; + int arbitration_burst; + int low_weight; + int medium_weight; + int high_weight; + int wrr_priority; int mem_size; int shm_id; int enable_sgl; @@ -190,6 +196,14 @@ probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", fio_options->hostnqn); } + if (fio_options->enable_wrr) { + opts->arb_mechanism = 
SPDK_NVME_CC_AMS_WRR; + opts->arbitration_burst = fio_options->arbitration_burst; + opts->low_priority_weight = fio_options->low_weight; + opts->medium_priority_weight = fio_options->medium_weight; + opts->high_priority_weight = fio_options->high_weight; + } + if (fio_options->digest_enable) { if (strcasecmp(fio_options->digest_enable, "HEADER") == 0) { opts->header_digest = true; @@ -234,6 +248,7 @@ attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, uint32_t ns_id; char *p; long int tmp; + struct spdk_fio_options *fio_options = td->eo; p = strstr(f->file_name, "ns="); if (p != NULL) { @@ -307,6 +322,9 @@ attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, spdk_nvme_ctrlr_get_default_io_qpair_opts(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts)); qpopts.delay_cmd_submit = true; + if (fio_options->enable_wrr) { + qpopts.qprio = fio_options->wrr_priority; + } fio_qpair->qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts)); if (!fio_qpair->qpair) { @@ -1023,6 +1041,66 @@ static void spdk_fio_cleanup(struct thread_data *td) * Adding new parameters by defining them here and defining a callback * function to read the parameter value. 
*/ static struct fio_option options[] = { + { + .name = "enable_wrr", + .lname = "Enable weighted round robin (WRR) for IO submission queues", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, enable_wrr), + .def = "0", + .help = "Enable weighted round robin (WRR) for IO submission queues", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "arbitration_burst", + .lname = "Arbitration Burst", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, arbitration_burst), + .def = "0", + .help = "Arbitration Burst used for WRR (valid range from 0 - 7)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "low_weight", + .lname = "low_weight for WRR", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, low_weight), + .def = "0", + .help = "low_weight used for WRR (valid range from 0 - 255)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "medium_weight", + .lname = "medium_weight for WRR", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, medium_weight), + .def = "0", + .help = "medium weight used for WRR (valid range from 0 - 255)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "high_weight", + .lname = "high_weight for WRR", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, high_weight), + .def = "0", + .help = "high weight used for WRR (valid range from 0 - 255)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "wrr_priority", + .lname = "priority used for WRR", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, wrr_priority), + .def = "0", + .help = "priority used for WRR (valid range from 0-3)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, { .name = "mem_size_mb", .lname = "Memory size in MB", diff --git a/examples/nvme/identify/identify.c b/examples/nvme/identify/identify.c index 
722f8d3eeb2..06d828b294b 100644 --- a/examples/nvme/identify/identify.c +++ b/examples/nvme/identify/identify.c @@ -65,6 +65,10 @@ static struct spdk_nvme_health_information_page health_page; static struct spdk_nvme_firmware_page firmware_page; +static struct spdk_nvme_ana_page *g_ana_log_page; + +static size_t g_ana_log_page_size; + static struct spdk_nvme_cmds_and_effect_log_page cmd_effects_log_page; static struct spdk_nvme_intel_smart_information_page intel_smart_page; @@ -266,6 +270,19 @@ get_firmware_log_page(struct spdk_nvme_ctrlr *ctrlr) return 0; } +static int +get_ana_log_page(struct spdk_nvme_ctrlr *ctrlr) +{ + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, + SPDK_NVME_GLOBAL_NS_TAG, g_ana_log_page, g_ana_log_page_size, 0, + get_log_page_completion, NULL)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + return 0; +} + static int get_cmd_effects_log_page(struct spdk_nvme_ctrlr *ctrlr) { @@ -441,6 +458,26 @@ get_log_pages(struct spdk_nvme_ctrlr *ctrlr) } } + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr, SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS)) { + /* We always set RGO (Return Groups Only) to 0 in this tool, an ANA group + * descriptor is returned only if that ANA group contains namespaces + * that are attached to the controller processing the command, and + * namespaces attached to the controller shall be members of an ANA group. + * Hence the following size should be enough. 
+ */ + g_ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * + sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn * + sizeof(uint32_t); + g_ana_log_page = calloc(1, g_ana_log_page_size); + if (g_ana_log_page == NULL) { + exit(1); + } + if (get_ana_log_page(ctrlr) == 0) { + outstanding_commands++; + } else { + printf("Get Log Page (Asymmetric Namespace Access) failed\n"); + } + } if (cdata->lpa.celp) { if (get_cmd_effects_log_page(ctrlr) == 0) { outstanding_commands++; @@ -662,8 +699,9 @@ print_ocssd_geometry(struct spdk_ocssd_geometry_data *geometry_data) } static void -print_namespace(struct spdk_nvme_ns *ns) +print_namespace(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) { + const struct spdk_nvme_ctrlr_data *cdata; const struct spdk_nvme_ns_data *nsdata; const struct spdk_uuid *uuid; uint32_t i; @@ -671,6 +709,7 @@ print_namespace(struct spdk_nvme_ns *ns) char uuid_str[SPDK_UUID_STRING_LEN]; uint32_t blocksize; + cdata = spdk_nvme_ctrlr_get_data(ctrlr); nsdata = spdk_nvme_ns_get_data(ns); flags = spdk_nvme_ns_get_flags(ns); @@ -764,6 +803,11 @@ print_namespace(struct spdk_nvme_ns *ns) printf("NGUID/EUI64 Never Reused: %s\n", nsdata->nsfeat.guid_never_reused ? 
"Yes" : "No"); + + if (cdata->cmic.ana_reporting) { + printf("ANA group ID: %u\n", nsdata->anagrpid); + } + printf("Number of LBA Formats: %d\n", nsdata->nlbaf + 1); printf("Current LBA Format: LBA Format #%02d\n", nsdata->flbas.format); @@ -772,7 +816,7 @@ print_namespace(struct spdk_nvme_ns *ns) i, 1 << nsdata->lbaf[i].lbads, nsdata->lbaf[i].ms); printf("\n"); - if (spdk_nvme_ctrlr_is_ocssd_supported(spdk_nvme_ns_get_ctrlr(ns))) { + if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { get_ocssd_geometry(ns, &geometry_data); print_ocssd_geometry(&geometry_data); get_ocssd_chunk_info_log_page(ns); @@ -886,12 +930,13 @@ print_controller(struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_transport union spdk_nvme_vs_register vs; union spdk_nvme_cmbsz_register cmbsz; uint8_t str[512]; - uint32_t i; + uint32_t i, j; struct spdk_nvme_error_information_entry *error_entry; struct spdk_pci_addr pci_addr; struct spdk_pci_device *pci_dev; struct spdk_pci_id pci_id; uint32_t nsid; + struct spdk_nvme_ana_group_descriptor *desc; cap = spdk_nvme_ctrlr_get_regs_cap(ctrlr); vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); @@ -1087,6 +1132,32 @@ print_controller(struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_transport } printf("Per-Namespace SMART Log: %s\n", cdata->lpa.ns_smart ? "Yes" : "No"); + if (cdata->cmic.ana_reporting == 0) { + printf("Asymmetric Namespace Access Log Page: Not Supported\n"); + } else { + printf("Asymmetric Namespace Access Log Page: Supported\n"); + printf("ANA Transition Time : %u sec\n", cdata->anatt); + printf("\n"); + printf("Aymmetric Namespace Access Capabilities\n"); + printf(" ANA Optimized State : %s\n", + cdata->anacap.ana_optimized_state ? "Supported" : "Not Supported"); + printf(" ANA Non-Optimized State : %s\n", + cdata->anacap.ana_non_optimized_state ? "Supported" : "Not Supported"); + printf(" ANA Inaccessible State : %s\n", + cdata->anacap.ana_inaccessible_state ? 
"Supported" : "Not Supported"); + printf(" ANA Persistent Loss State : %s\n", + cdata->anacap.ana_persistent_loss_state ? "Supported" : "Not Supported"); + printf(" ANA Change State : %s\n", + cdata->anacap.ana_change_state ? "Supported" : "Not Supported"); + printf(" ANAGRPID is not changed : %s\n", + cdata->anacap.no_change_anagrpid ? "Yes" : "No"); + printf(" Non-Zero ANAGRPID for NS Mgmt Cmd : %s\n", + cdata->anacap.non_zero_anagrpid ? "Supported" : "Not Supported"); + printf("\n"); + printf("ANA Group Identifier Maximum : %u\n", cdata->anagrpmax); + printf("Number of ANA Group Identifiers : %u\n", cdata->nanagrpid); + printf("Max Number of Allowed Namespaces : %u\n", cdata->mnan); + } printf("Command Effects Log Page: %s\n", cdata->lpa.celp ? "Supported" : "Not Supported"); printf("Get Log Page Extended Data: %s\n", @@ -1183,6 +1254,36 @@ print_controller(struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_transport } printf("\n"); + if (g_ana_log_page) { + printf("Asymmetric Namespace Access\n"); + printf("===========================\n"); + if (g_hex_dump) { + hex_dump(g_ana_log_page, g_ana_log_page_size); + printf("\n"); + } + + printf("Change Count : %" PRIx64 "\n", g_ana_log_page->change_count); + printf("Number of ANA Group Descriptors : %u\n", g_ana_log_page->num_ana_group_desc); + + desc = (void *)((uint8_t *)g_ana_log_page + sizeof(struct spdk_nvme_ana_page)); + + for (i = 0; i < g_ana_log_page->num_ana_group_desc; i++) { + printf("ANA Group Descriptor : %u\n", i); + printf(" ANA Group ID : %u\n", desc->ana_group_id); + printf(" Number of NSID Values : %u\n", desc->num_of_nsid); + printf(" Change Count : %" PRIx64 "\n", desc->change_count); + printf(" ANA State : %u\n", desc->ana_state); + for (j = 0; j < desc->num_of_nsid; j++) { + printf(" Namespace Identifier : %u\n", desc->nsid[j]); + } + desc = (void *)((uint8_t *)desc + sizeof(struct spdk_nvme_ana_group_descriptor) + + desc->num_of_nsid * sizeof(uint32_t)); + } + free(g_ana_log_page); + } + + 
printf("\n"); + if (cdata->lpa.celp) { printf("Commands Supported and Effects\n"); printf("==============================\n"); @@ -1571,7 +1672,7 @@ print_controller(struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_transport printf("=================\n"); for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { - print_namespace(spdk_nvme_ctrlr_get_ns(ctrlr, nsid)); + print_namespace(ctrlr, spdk_nvme_ctrlr_get_ns(ctrlr, nsid)); } if (g_discovery_page) { diff --git a/examples/nvme/perf/Makefile b/examples/nvme/perf/Makefile index 139681d29b7..0742f18427d 100644 --- a/examples/nvme/perf/Makefile +++ b/examples/nvme/perf/Makefile @@ -35,13 +35,13 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) APP = perf +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk + ifeq ($(OS),Linux) SYS_LIBS += -laio CFLAGS += -DHAVE_LIBAIO endif -include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk - install: $(APP) $(INSTALL_EXAMPLE) diff --git a/examples/nvme/perf/perf.c b/examples/nvme/perf/perf.c index 6aecd0de40c..e03ceaf77c8 100644 --- a/examples/nvme/perf/perf.c +++ b/examples/nvme/perf/perf.c @@ -49,6 +49,10 @@ #include "spdk/log.h" #include "spdk/likely.h" +#ifdef SPDK_CONFIG_URING +#include +#endif + #if HAVE_LIBAIO #include #endif @@ -67,6 +71,7 @@ struct ctrlr_entry { enum entry_type { ENTRY_TYPE_NVME_NS, ENTRY_TYPE_AIO_FILE, + ENTRY_TYPE_URING_FILE, }; struct ns_fn_table; @@ -80,9 +85,14 @@ struct ns_entry { struct spdk_nvme_ctrlr *ctrlr; struct spdk_nvme_ns *ns; } nvme; -#if HAVE_LIBAIO +#ifdef SPDK_CONFIG_URING struct { int fd; + } uring; +#endif +#if HAVE_LIBAIO + struct { + int fd; } aio; #endif } u; @@ -119,13 +129,17 @@ static const double g_latency_cutoffs[] = { -1, }; -struct ns_worker_ctx { - struct ns_entry *entry; +struct ns_worker_stats { uint64_t io_completed; uint64_t last_io_completed; uint64_t total_tsc; uint64_t min_tsc; uint64_t max_tsc; +}; + +struct ns_worker_ctx { + struct ns_entry *entry; + struct 
ns_worker_stats stats; uint64_t current_queue_depth; uint64_t offset_in_ios; bool is_draining; @@ -139,6 +153,15 @@ struct ns_worker_ctx { int last_qpair; } nvme; +#ifdef SPDK_CONFIG_URING + struct { + struct io_uring ring; + uint64_t io_inflight; + uint64_t io_pending; + struct io_uring_cqe **cqes; + + } uring; +#endif #if HAVE_LIBAIO struct { struct io_event *events; @@ -213,16 +236,20 @@ static int g_queue_depth; static int g_nr_io_queues_per_ns = 1; static int g_nr_unused_io_queues; static int g_time_in_sec; +static int g_warmup_time_in_sec; static uint32_t g_max_completions; static int g_dpdk_mem; static int g_shm_id = -1; static uint32_t g_disable_sq_cmb; +static bool g_use_uring; static bool g_no_pci; static bool g_warn; static bool g_header_digest; static bool g_data_digest; static bool g_no_shn_notification; static bool g_mix_specified; +/* The flag is used to exit the program while keep alive fails on the transport */ +static bool g_exit; /* Default to 10 seconds for the keep alive value. This value is arbitrary. 
*/ static uint32_t g_keep_alive_timeout_in_ms = 10000; @@ -236,12 +263,126 @@ struct trid_entry { static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list); -static int g_aio_optind; /* Index of first AIO filename in argv */ +static int g_file_optind; /* Index of first filename in argv */ static inline void task_complete(struct perf_task *task); -#if HAVE_LIBAIO +#ifdef SPDK_CONFIG_URING + +static void +uring_setup_payload(struct perf_task *task, uint8_t pattern) +{ + task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); + task->iov.iov_len = g_io_size_bytes; + if (task->iov.iov_base == NULL) { + fprintf(stderr, "spdk_dma_zmalloc() for task->iov.iov_base failed\n"); + exit(1); + } + memset(task->iov.iov_base, pattern, task->iov.iov_len); +} + +static int +uring_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, + struct ns_entry *entry, uint64_t offset_in_ios) +{ + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(&ns_ctx->u.uring.ring); + if (!sqe) { + fprintf(stderr, "Cannot get sqe\n"); + return -1; + } + + if (task->is_read) { + io_uring_prep_readv(sqe, entry->u.uring.fd, &task->iov, 1, offset_in_ios * task->iov.iov_len); + } else { + io_uring_prep_writev(sqe, entry->u.uring.fd, &task->iov, 1, offset_in_ios * task->iov.iov_len); + } + + io_uring_sqe_set_data(sqe, task); + ns_ctx->u.uring.io_pending++; + + return 0; +} + +static void +uring_check_io(struct ns_worker_ctx *ns_ctx) +{ + int i, count, to_complete, to_submit, ret = 0; + struct perf_task *task; + + to_submit = ns_ctx->u.uring.io_pending; + + if (to_submit > 0) { + /* If there are I/O to submit, use io_uring_submit here. + * It will automatically call spdk_io_uring_enter appropriately. 
*/ + ret = io_uring_submit(&ns_ctx->u.uring.ring); + if (ret < 0) { + return; + } + ns_ctx->u.uring.io_pending = 0; + ns_ctx->u.uring.io_inflight += to_submit; + } + + to_complete = ns_ctx->u.uring.io_inflight; + if (to_complete > 0) { + count = io_uring_peek_batch_cqe(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes, to_complete); + ns_ctx->u.uring.io_inflight -= count; + for (i = 0; i < count; i++) { + assert(ns_ctx->u.uring.cqes[i] != NULL); + task = (struct perf_task *)ns_ctx->u.uring.cqes[i]->user_data; + if (ns_ctx->u.uring.cqes[i]->res != (int)task->iov.iov_len) { + fprintf(stderr, "cqe[i]->status=%d\n", ns_ctx->u.uring.cqes[i]->res); + exit(0); + } + io_uring_cqe_seen(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes[i]); + task_complete(task); + } + } +} + +static void +uring_verify_io(struct perf_task *task, struct ns_entry *entry) +{ +} + +static int +uring_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + if (io_uring_queue_init(g_queue_depth, &ns_ctx->u.uring.ring, 0) < 0) { + SPDK_ERRLOG("uring I/O context setup failure\n"); + return -1; + } + + ns_ctx->u.uring.cqes = calloc(g_queue_depth, sizeof(struct io_uring_cqe *)); + if (!ns_ctx->u.uring.cqes) { + io_uring_queue_exit(&ns_ctx->u.uring.ring); + return -1; + } + + return 0; +} + +static void +uring_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + io_uring_queue_exit(&ns_ctx->u.uring.ring); + free(ns_ctx->u.uring.cqes); +} + +static const struct ns_fn_table uring_fn_table = { + .setup_payload = uring_setup_payload, + .submit_io = uring_submit_io, + .check_io = uring_check_io, + .verify_io = uring_verify_io, + .init_ns_worker_ctx = uring_init_ns_worker_ctx, + .cleanup_ns_worker_ctx = uring_cleanup_ns_worker_ctx, +}; + +#endif + +#ifdef HAVE_LIBAIO static void aio_setup_payload(struct perf_task *task, uint8_t pattern) { @@ -344,8 +485,12 @@ static const struct ns_fn_table aio_fn_table = { .cleanup_ns_worker_ctx = aio_cleanup_ns_worker_ctx, }; +#endif /* HAVE_LIBAIO */ + +#if defined(HAVE_LIBAIO) || 
defined(SPDK_CONFIG_URING) + static int -register_aio_file(const char *path) +register_file(const char *path) { struct ns_entry *entry; @@ -365,20 +510,20 @@ register_aio_file(const char *path) fd = open(path, flags); if (fd < 0) { - fprintf(stderr, "Could not open AIO device %s: %s\n", path, strerror(errno)); + fprintf(stderr, "Could not open device %s: %s\n", path, strerror(errno)); return -1; } size = spdk_fd_get_size(fd); if (size == 0) { - fprintf(stderr, "Could not determine size of AIO device %s\n", path); + fprintf(stderr, "Could not determine size of device %s\n", path); close(fd); return -1; } blklen = spdk_fd_get_blocklen(fd); if (blklen == 0) { - fprintf(stderr, "Could not determine block size of AIO device %s\n", path); + fprintf(stderr, "Could not determine block size of device %s\n", path); close(fd); return -1; } @@ -394,13 +539,23 @@ register_aio_file(const char *path) entry = malloc(sizeof(struct ns_entry)); if (entry == NULL) { close(fd); - perror("aio ns_entry malloc"); + perror("ns_entry malloc"); return -1; } - entry->type = ENTRY_TYPE_AIO_FILE; - entry->fn_table = &aio_fn_table; - entry->u.aio.fd = fd; + if (g_use_uring) { +#ifdef SPDK_CONFIG_URING + entry->type = ENTRY_TYPE_URING_FILE; + entry->fn_table = &uring_fn_table; + entry->u.uring.fd = fd; +#endif + } else { +#if HAVE_LIBAIO + entry->type = ENTRY_TYPE_AIO_FILE; + entry->fn_table = &aio_fn_table; + entry->u.aio.fd = fd; +#endif + } entry->size_in_ios = size / g_io_size_bytes; entry->io_size_blocks = g_io_size_bytes / blklen; @@ -414,20 +569,20 @@ register_aio_file(const char *path) } static int -register_aio_files(int argc, char **argv) +register_files(int argc, char **argv) { int i; - /* Treat everything after the options as files for AIO */ - for (i = g_aio_optind; i < argc; i++) { - if (register_aio_file(argv[i]) != 0) { + /* Treat everything after the options as files for AIO/URING */ + for (i = g_file_optind; i < argc; i++) { + if (register_file(argv[i]) != 0) { return 1; } } 
return 0; } -#endif /* HAVE_LIBAIO */ +#endif static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); @@ -612,7 +767,7 @@ nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) ns_ctx->u.nvme.group = spdk_nvme_poll_group_create(NULL); if (ns_ctx->u.nvme.group == NULL) { - return -1; + goto poll_group_failed; } group = ns_ctx->u.nvme.group; @@ -622,21 +777,35 @@ nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) qpair = ns_ctx->u.nvme.qpair[i]; if (!qpair) { printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); - return -1; + goto qpair_failed; } if (spdk_nvme_poll_group_add(group, qpair)) { printf("ERROR: unable to add I/O qpair to poll group.\n"); - return -1; + spdk_nvme_ctrlr_free_io_qpair(qpair); + goto qpair_failed; } if (spdk_nvme_ctrlr_connect_io_qpair(entry->u.nvme.ctrlr, qpair)) { printf("ERROR: unable to connect I/O qpair.\n"); - return -1; + spdk_nvme_poll_group_remove(group, qpair); + spdk_nvme_ctrlr_free_io_qpair(qpair); + goto qpair_failed; } } return 0; + +qpair_failed: + for (; i > 0; --i) { + spdk_nvme_poll_group_remove(ns_ctx->u.nvme.group, ns_ctx->u.nvme.qpair[i - 1]); + spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i - 1]); + } + + spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group); +poll_group_failed: + free(ns_ctx->u.nvme.qpair); + return -1; } static void @@ -678,7 +847,10 @@ build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) res = snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); break; case SPDK_NVME_TRANSPORT_TCP: - res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + case SPDK_NVME_TRANSPORT_CUSTOM: + res = snprintf(name, length, "CUSTOM (%s)", trid->traddr); break; default: @@ -942,14 +1114,14 @@ task_complete(struct perf_task *task) ns_ctx = task->ns_ctx; entry = ns_ctx->entry; ns_ctx->current_queue_depth--; - ns_ctx->io_completed++; 
+ ns_ctx->stats.io_completed++; tsc_diff = spdk_get_ticks() - task->submit_tsc; - ns_ctx->total_tsc += tsc_diff; - if (spdk_unlikely(ns_ctx->min_tsc > tsc_diff)) { - ns_ctx->min_tsc = tsc_diff; + ns_ctx->stats.total_tsc += tsc_diff; + if (spdk_unlikely(ns_ctx->stats.min_tsc > tsc_diff)) { + ns_ctx->stats.min_tsc = tsc_diff; } - if (spdk_unlikely(ns_ctx->max_tsc < tsc_diff)) { - ns_ctx->max_tsc = tsc_diff; + if (spdk_unlikely(ns_ctx->stats.max_tsc < tsc_diff)) { + ns_ctx->stats.max_tsc = tsc_diff; } if (spdk_unlikely(g_latency_sw_tracking_level > 0)) { spdk_histogram_data_tally(ns_ctx->histogram, tsc_diff); @@ -1031,7 +1203,7 @@ cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) } static void -print_periodic_performance(void) +print_periodic_performance(bool warmup) { uint64_t io_this_second; double mb_this_second; @@ -1050,15 +1222,15 @@ print_periodic_performance(void) while (worker) { ns_ctx = worker->ns_ctx; while (ns_ctx) { - io_this_second += ns_ctx->io_completed - ns_ctx->last_io_completed; - ns_ctx->last_io_completed = ns_ctx->io_completed; + io_this_second += ns_ctx->stats.io_completed - ns_ctx->stats.last_io_completed; + ns_ctx->stats.last_io_completed = ns_ctx->stats.io_completed; ns_ctx = ns_ctx->next; } worker = worker->next; } mb_this_second = (double)io_this_second * g_io_size_bytes / (1024 * 1024); - printf("%9ju IOPS, %8.2f MiB/s\r", io_this_second, mb_this_second); + printf("%s%9ju IOPS, %8.2f MiB/s\r", warmup ? "[warmup] " : "", io_this_second, mb_this_second); fflush(stdout); } @@ -1066,9 +1238,10 @@ static int work_fn(void *arg) { uint64_t tsc_end, tsc_current, tsc_next_print; - struct worker_thread *worker = (struct worker_thread *)arg; + struct worker_thread *worker = (struct worker_thread *) arg; struct ns_worker_ctx *ns_ctx = NULL; uint32_t unfinished_ns_ctx; + bool warmup = false; /* Allocate queue pairs for each namespace. 
*/ ns_ctx = worker->ns_ctx; @@ -1081,9 +1254,15 @@ work_fn(void *arg) } tsc_current = spdk_get_ticks(); - tsc_end = tsc_current + g_time_in_sec * g_tsc_rate; tsc_next_print = tsc_current + g_tsc_rate; + if (g_warmup_time_in_sec) { + warmup = true; + tsc_end = tsc_current + g_warmup_time_in_sec * g_tsc_rate; + } else { + tsc_end = tsc_current + g_time_in_sec * g_tsc_rate; + } + /* Submit initial I/O for each namespace. */ ns_ctx = worker->ns_ctx; while (ns_ctx != NULL) { @@ -1091,7 +1270,7 @@ work_fn(void *arg) ns_ctx = ns_ctx->next; } - while (1) { + while (spdk_likely(!g_exit)) { /* * Check for completed I/O for each controller. A new * I/O will be submitted in the io_complete callback @@ -1107,11 +1286,30 @@ work_fn(void *arg) if (worker->lcore == g_master_core && tsc_current > tsc_next_print) { tsc_next_print += g_tsc_rate; - print_periodic_performance(); + print_periodic_performance(warmup); } if (tsc_current > tsc_end) { - break; + if (warmup) { + /* Update test end time, clear statistics */ + tsc_end = tsc_current + g_time_in_sec * g_tsc_rate; + ns_ctx = worker->ns_ctx; + + while (ns_ctx != NULL) { + memset(&ns_ctx->stats, 0, sizeof(ns_ctx->stats)); + ns_ctx->stats.min_tsc = UINT64_MAX; + ns_ctx = ns_ctx->next; + } + + if (worker->lcore == g_master_core && isatty(STDOUT_FILENO)) { + /* warmup stage prints a longer string to stdout, need to erase it */ + printf("%c[2K", 27); + } + + warmup = false; + } else { + break; + } } } @@ -1143,8 +1341,8 @@ work_fn(void *arg) static void usage(char *program_name) { printf("%s options", program_name); -#if HAVE_LIBAIO - printf(" [AIO device(s)]..."); +#if defined(SPDK_CONFIG_URING) || defined(HAVE_LIBAIO) + printf(" [Kernel device(s)]..."); #endif printf("\n"); printf("\t[-q io depth]\n"); @@ -1158,6 +1356,7 @@ static void usage(char *program_name) printf("\t\t-L for latency summary, -LL for detailed histogram\n"); printf("\t[-l enable latency tracking via ssd (if supported), default: disabled]\n"); printf("\t[-t time in 
seconds]\n"); + printf("\t[-a warmup time in seconds]\n"); printf("\t[-c core mask for I/O submission/completion.]\n"); printf("\t\t(default: 1)\n"); printf("\t[-D disable submission queue in controller memory buffer, default: enabled]\n"); @@ -1188,6 +1387,9 @@ static void usage(char *program_name) printf("\t"); spdk_log_usage(stdout, "-T"); printf("\t[-V enable VMD enumeration]\n"); +#ifdef SPDK_CONFIG_URING + printf("\t[-R enable using liburing to drive kernel devices (Default: libaio)]\n"); +#endif #ifdef DEBUG printf("\t[-G enable debug logging]\n"); #else @@ -1270,16 +1472,17 @@ print_performance(void) while (worker) { ns_ctx = worker->ns_ctx; while (ns_ctx) { - if (ns_ctx->io_completed != 0) { - io_per_second = (double)ns_ctx->io_completed / g_time_in_sec; + if (ns_ctx->stats.io_completed != 0) { + io_per_second = (double)ns_ctx->stats.io_completed / g_time_in_sec; mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024); - average_latency = ((double)ns_ctx->total_tsc / ns_ctx->io_completed) * 1000 * 1000 / g_tsc_rate; - min_latency = (double)ns_ctx->min_tsc * 1000 * 1000 / g_tsc_rate; + average_latency = ((double)ns_ctx->stats.total_tsc / ns_ctx->stats.io_completed) * 1000 * 1000 / + g_tsc_rate; + min_latency = (double)ns_ctx->stats.min_tsc * 1000 * 1000 / g_tsc_rate; if (min_latency < min_latency_so_far) { min_latency_so_far = min_latency; } - max_latency = (double)ns_ctx->max_tsc * 1000 * 1000 / g_tsc_rate; + max_latency = (double)ns_ctx->stats.max_tsc * 1000 * 1000 / g_tsc_rate; if (max_latency > max_latency_so_far) { max_latency_so_far = max_latency; } @@ -1290,8 +1493,8 @@ print_performance(void) average_latency, min_latency, max_latency); total_io_per_second += io_per_second; total_mb_per_second += mb_per_second; - total_io_completed += ns_ctx->io_completed; - total_io_tsc += ns_ctx->total_tsc; + total_io_completed += ns_ctx->stats.io_completed; + total_io_tsc += ns_ctx->stats.total_tsc; ns_count++; } ns_ctx = ns_ctx->next; @@ -1598,8 +1801,9 
@@ parse_args(int argc, char **argv) long int val; int rc; - while ((op = getopt(argc, argv, "c:e:i:lo:q:r:k:s:t:w:C:DGHILM:NP:T:U:V")) != -1) { + while ((op = getopt(argc, argv, "a:c:e:i:lo:q:r:k:s:t:w:C:DGHILM:NP:RT:U:V")) != -1) { switch (op) { + case 'a': case 'i': case 'C': case 'P': @@ -1616,6 +1820,9 @@ parse_args(int argc, char **argv) return val; } switch (op) { + case 'a': + g_warmup_time_in_sec = val; + break; case 'i': g_shm_id = val; break; @@ -1696,6 +1903,15 @@ parse_args(int argc, char **argv) case 'N': g_no_shn_notification = true; break; + case 'R': +#ifndef SPDK_CONFIG_URING + fprintf(stderr, "%s must be rebuilt with CONFIG_URING=y for -R flag.\n", + argv[0]); + usage(argv[0]); + return 0; +#endif + g_use_uring = true; + break; case 'T': rc = spdk_log_set_flag(optarg); if (rc < 0) { @@ -1787,7 +2003,7 @@ parse_args(int argc, char **argv) } } - g_aio_optind = optind; + g_file_optind = optind; return 0; } @@ -1976,7 +2192,7 @@ associate_workers_with_ns(void) } printf("Associating %s with lcore %d\n", entry->name, worker->lcore); - ns_ctx->min_tsc = UINT64_MAX; + ns_ctx->stats.min_tsc = UINT64_MAX; ns_ctx->entry = entry; ns_ctx->next = worker->ns_ctx; ns_ctx->histogram = spdk_histogram_data_alloc(); @@ -2002,6 +2218,7 @@ nvme_poll_ctrlrs(void *arg) { struct ctrlr_entry *entry; int oldstate; + int rc; spdk_unaffinitize_thread(); @@ -2011,7 +2228,10 @@ nvme_poll_ctrlrs(void *arg) entry = g_controllers; while (entry) { if (entry->trtype != SPDK_NVME_TRANSPORT_PCIE) { - spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr); + rc = spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr); + if (spdk_unlikely(rc < 0 && !g_exit)) { + g_exit = true; + } } entry = entry->next; } @@ -2063,8 +2283,8 @@ int main(int argc, char **argv) goto cleanup; } -#if HAVE_LIBAIO - if (register_aio_files(argc, argv) != 0) { +#if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING) + if (register_files(argc, argv) != 0) { rc = -1; goto cleanup; } @@ -2080,7 +2300,7 @@ int 
main(int argc, char **argv) } if (g_num_namespaces == 0) { - fprintf(stderr, "No valid NVMe controllers or AIO devices found\n"); + fprintf(stderr, "No valid NVMe controllers or AIO or URING devices found\n"); goto cleanup; } diff --git a/examples/nvme/reconnect/reconnect.c b/examples/nvme/reconnect/reconnect.c index 74c5f36571c..54d1ff3686c 100644 --- a/examples/nvme/reconnect/reconnect.c +++ b/examples/nvme/reconnect/reconnect.c @@ -263,7 +263,10 @@ build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); break; case SPDK_NVME_TRANSPORT_TCP: - snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + case SPDK_NVME_TRANSPORT_CUSTOM: + snprintf(name, length, "CUSTOM (%s)", trid->traddr); break; default: fprintf(stderr, "Unknown transport type %d\n", trid->trtype); diff --git a/examples/nvmf/nvmf/Makefile b/examples/nvmf/nvmf/Makefile index a2574bde2d1..ccf5273bed9 100644 --- a/examples/nvmf/nvmf/Makefile +++ b/examples/nvmf/nvmf/Makefile @@ -40,7 +40,7 @@ APP := nvmf C_SRCS := nvmf.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) SPDK_LIB_LIST += nvmf thread util bdev conf accel rpc jsonrpc json log sock trace notify -SPDK_LIB_LIST += event event_bdev event_accel event_vmd +SPDK_LIB_LIST += event $(EVENT_BDEV_SUBSYSTEM) ifeq ($(CONFIG_FC),y) ifneq ($(strip $(CONFIG_FC_PATH)),) diff --git a/examples/nvmf/nvmf/nvmf.c b/examples/nvmf/nvmf/nvmf.c index e789b484123..6ee7b44f81e 100644 --- a/examples/nvmf/nvmf/nvmf.c +++ b/examples/nvmf/nvmf/nvmf.c @@ -486,11 +486,17 @@ static void nvmf_tgt_subsystem_stop_next(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) { + int rc; + subsystem = spdk_nvmf_subsystem_get_next(subsystem); if (subsystem) { - spdk_nvmf_subsystem_stop(subsystem, - nvmf_tgt_subsystem_stop_next, - cb_arg); + rc = 
spdk_nvmf_subsystem_stop(subsystem, + nvmf_tgt_subsystem_stop_next, + cb_arg); + if (rc) { + nvmf_tgt_subsystem_stop_next(subsystem, cb_arg, 0); + fprintf(stderr, "Unable to stop NVMe-oF subsystem. Trying others.\n"); + } return; } @@ -504,12 +510,17 @@ static void nvmf_tgt_stop_subsystems(struct nvmf_target *nvmf_tgt) { struct spdk_nvmf_subsystem *subsystem; + int rc; subsystem = spdk_nvmf_subsystem_get_first(nvmf_tgt->tgt); if (spdk_likely(subsystem)) { - spdk_nvmf_subsystem_stop(subsystem, - nvmf_tgt_subsystem_stop_next, - NULL); + rc = spdk_nvmf_subsystem_stop(subsystem, + nvmf_tgt_subsystem_stop_next, + NULL); + if (rc) { + nvmf_tgt_subsystem_stop_next(subsystem, NULL, 0); + fprintf(stderr, "Unable to stop NVMe-oF subsystem. Trying others.\n"); + } } else { g_target_state = NVMF_FINI_POLL_GROUPS; } @@ -519,10 +530,17 @@ static void nvmf_tgt_subsystem_start_next(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) { + int rc; + subsystem = spdk_nvmf_subsystem_get_next(subsystem); if (subsystem) { - spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_start_next, - cb_arg); + rc = spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_start_next, + cb_arg); + if (rc) { + g_target_state = NVMF_FINI_STOP_SUBSYSTEMS; + fprintf(stderr, "Unable to start NVMe-oF subsystem. shutting down app.\n"); + nvmf_target_advance_state(); + } return; } @@ -536,6 +554,7 @@ static void nvmf_tgt_start_subsystems(struct nvmf_target *nvmf_tgt) { struct spdk_nvmf_subsystem *subsystem; + int rc; /* Subsystem is the NVM subsystem which is a combine of namespaces * except the discovery subsystem which is used for discovery service. @@ -548,9 +567,13 @@ nvmf_tgt_start_subsystems(struct nvmf_target *nvmf_tgt) * Start subsystem means make it from inactive to active that means * subsystem start to work or it can be accessed. 
*/ - spdk_nvmf_subsystem_start(subsystem, - nvmf_tgt_subsystem_start_next, - NULL); + rc = spdk_nvmf_subsystem_start(subsystem, + nvmf_tgt_subsystem_start_next, + NULL); + if (rc) { + fprintf(stderr, "Unable to start NVMe-oF subsystem. shutting down app.\n"); + g_target_state = NVMF_FINI_STOP_SUBSYSTEMS; + } } else { g_target_state = NVMF_RUNNING; } diff --git a/examples/sock/hello_world/hello_sock.c b/examples/sock/hello_world/hello_sock.c index 75ae4e63109..fd046f29284 100644 --- a/examples/sock/hello_world/hello_sock.c +++ b/examples/sock/hello_world/hello_sock.c @@ -86,7 +86,7 @@ hello_sock_usage(void) { printf(" -H host_addr host address\n"); printf(" -P port port number\n"); - printf(" -N sock_impl socket implementation, e.g., -N posix or -N vpp\n"); + printf(" -N sock_impl socket implementation, e.g., -N posix or -N uring\n"); printf(" -S start in server mode\n"); printf(" -V print out additional informations"); } diff --git a/include/spdk/accel_engine.h b/include/spdk/accel_engine.h index 766e42e5caf..8c1afaee599 100644 --- a/include/spdk/accel_engine.h +++ b/include/spdk/accel_engine.h @@ -57,7 +57,7 @@ enum accel_capability { /** * Acceleration operation callback. * - * \param ref 'accel_req' passed to the corresponding spdk_accel_submit* call. + * \param ref 'accel_task' passed to the corresponding spdk_accel_submit* call. * \param status 0 if it completed successfully, or negative errno if it failed. */ typedef void (*spdk_accel_completion_cb)(void *ref, int status); @@ -71,7 +71,7 @@ typedef void (*spdk_accel_fini_cb)(void *cb_arg); struct spdk_io_channel; -struct spdk_accel_task; +struct spdk_accel_batch; /** * Initialize the acceleration engine. @@ -112,8 +112,7 @@ struct spdk_io_channel *spdk_accel_engine_get_io_channel(void); /** * Retrieve accel engine capabilities. * - * \param ch I/O channel to submit request to the accel engine. This channel can - * be obtained by the function spdk_accel_engine_get_io_channel(). 
+ * \param ch I/O channel associated with this call. * * \return bitmap of capabilities defined by enum accel_capability. */ @@ -122,97 +121,229 @@ uint64_t spdk_accel_get_capabilities(struct spdk_io_channel *ch); /** * Submit a copy request. * - * \param accel_req Accel request task. - * \param ch I/O channel to submit request to the accel engine. This channel can - * be obtained by the function spdk_accel_engine_get_io_channel(). + * \param ch I/O channel associated with this call. * \param dst Destination to copy to. * \param src Source to copy from. * \param nbytes Length in bytes to copy. - * \param cb Called when this copy operation completes. + * \param cb_fn Called when this copy operation completes. + * \param cb_arg Callback argument. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg); + +/** + * Synchronous call to get batch size. This is the maximum number of + * descriptors that a batch can contain. Once this limit is reached the batch + * should be processed with spdk_accel_batch_submit(). + * + * \param ch I/O channel associated with this call. + * + * \return max number of descriptors per batch. + */ +uint32_t spdk_accel_batch_get_max(struct spdk_io_channel *ch); + +/** + * Synchronous call to create a batch sequence. + * + * \param ch I/O channel associated with this call. + * + * \return handle to use for subsequent batch requests, NULL on failure. + */ +struct spdk_accel_batch *spdk_accel_batch_create(struct spdk_io_channel *ch); + +/** + * Asynchronous call to submit a batch sequence. + * + * \param ch I/O channel associated with this call. + * \param batch Handle provided when the batch was started with spdk_accel_batch_create(). + * \param cb_fn Called when this operation completes. + * \param cb_arg Callback argument. + * + * \return 0 on success, negative errno on failure. 
+ */ +int spdk_accel_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg); + +/** + * Synchronous call to cancel a batch sequence. In some cases prepared commands will be + * processed if they cannot be cancelled. + * + * \param ch I/O channel associated with this call. + * \param batch Handle provided when the batch was started with spdk_accel_batch_create(). + * + * \return 0 on success, negative errno on failure. + */ +int spdk_accel_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch); + +/** + * Synchronous call to prepare a copy request into a previously initialized batch + * created with spdk_accel_batch_create(). The callback will be called when the copy + * completes after the batch has been submitted by an asynchronous call to + * spdk_accel_batch_submit(). + * + * \param ch I/O channel associated with this call. + * \param batch Handle provided when the batch was started with spdk_accel_batch_create(). + * \param dst Destination to copy to. + * \param src Source to copy from. + * \param nbytes Length in bytes to copy. + * \param cb_fn Called when this operation completes. + * \param cb_arg Callback argument. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_accel_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, + void *cb_arg); + +/** + * Synchronous call to prepare a dualcast request into a previously initialized batch + * created with spdk_accel_batch_create(). The callback will be called when the dualcast + * completes after the batch has been submitted by an asynchronous call to + * spdk_accel_batch_submit(). + * + * \param ch I/O channel associated with this call. + * \param batch Handle provided when the batch was started with spdk_accel_batch_create(). + * \param dst1 First destination to copy to (must be 4K aligned). 
+ * \param dst2 Second destination to copy to (must be 4K aligned). + * \param src Source to copy from. + * \param nbytes Length in bytes to copy. + * \param cb_fn Called when this operation completes. + * \param cb_arg Callback argument. * * \return 0 on success, negative errno on failure. */ -int spdk_accel_submit_copy(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, void *dst, - void *src, uint64_t nbytes, spdk_accel_completion_cb cb); +int spdk_accel_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst1, void *dst2, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg); /** * Submit a dual cast copy request. * - * \param accel_req Accel request task. - * \param ch I/O channel to submit request to the accel engine. This channel can - * be obtained by the function spdk_accel_engine_get_io_channel(). + * \param ch I/O channel associated with this call. * \param dst1 First destination to copy to (must be 4K aligned). * \param dst2 Second destination to copy to (must be 4K aligned). * \param src Source to copy from. * \param nbytes Length in bytes to copy. - * \param cb Called when this copy operation completes. + * \param cb_fn Called when this copy operation completes. + * \param cb_arg Callback argument. * * \return 0 on success, negative errno on failure. */ -int spdk_accel_submit_dualcast(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, - void *dst1, void *dst2, void *src, uint64_t nbytes, - spdk_accel_completion_cb cb); +int spdk_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); + +/** + * Synchronous call to prepare a compare request into a previously initialized batch + * created with spdk_accel_batch_create(). The callback will be called when the compare + * completes after the batch has been submitted by an asynchronous call to + * spdk_accel_batch_submit(). 
+ * + * \param ch I/O channel associated with this call. + * \param batch Handle provided when the batch was started with spdk_accel_batch_create(). + * \param src1 First location to perform compare on. + * \param src2 Second location to perform compare on. + * \param nbytes Length in bytes to compare. + * \param cb_fn Called when this operation completes. + * \param cb_arg Callback argument. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_accel_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *src1, void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, + void *cb_arg); /** * Submit a compare request. * - * \param accel_req Accel request task. - * \param ch I/O channel to submit request to the accel engine. This channel can - * be obtained by the function spdk_accel_engine_get_io_channel(). + * \param ch I/O channel associated with this call. * \param src1 First location to perform compare on. * \param src2 Second location to perform compare on. * \param nbytes Length in bytes to compare. - * \param cb Called when this compare operation completes. + * \param cb_fn Called when this compare operation completes. + * \param cb_arg Callback argument. * * \return 0 on success, any other value means there was a miscompare. */ -int spdk_accel_submit_compare(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, - void *src1, void *src2, uint64_t nbytes, - spdk_accel_completion_cb cb); +int spdk_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg); + +/** + * Synchronous call to prepare a fill request into a previously initialized batch + * created with spdk_accel_batch_create(). The callback will be called when the fill + * completes after the batch has been submitted by an asynchronous call to + * spdk_accel_batch_submit(). + * + * \param ch I/O channel associated with this call. 
+ * \param batch Handle provided when the batch was started with spdk_accel_batch_create(). + * \param dst Destination to fill. + * \param fill Constant byte to fill to the destination. + * \param nbytes Length in bytes to fill. + * \param cb_fn Called when this operation completes. + * \param cb_arg Callback argument. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_accel_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst, uint8_t fill, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg); /** * Submit a fill request. * * This operation will fill the destination buffer with the specified value. * - * \param accel_req Accel request task. - * \param ch I/O channel to submit request to the accel engine. This channel can - * be obtained by the function spdk_accel_engine_get_io_channel(). + * \param ch I/O channel associated with this call. * \param dst Destination to fill. * \param fill Constant byte to fill to the destination. * \param nbytes Length in bytes to fill. - * \param cb Called when this fill operation completes. + * \param cb_fn Called when this fill operation completes. + * \param cb_arg Callback argument. * * \return 0 on success, negative errno on failure. */ -int spdk_accel_submit_fill(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, - void *dst, uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb); +int spdk_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg); /** - * Submit a CRC-32C calculation request. + * Synchronous call to prepare a crc32c request into a previously initialized batch + * created with spdk_accel_batch_create(). The callback will be called when the crc32c + * completes after the batch has been submitted by an asynchronous call to + * spdk_accel_batch_submit(). * - * This operation will calculate the 4 byte CRC32-C for the given data. 
- * - * \param accel_req Accel request task. - * \param ch I/O channel to submit request to the accel engine. This channel can - * be obtained by the function spdk_accel_engine_get_io_channel(). + * \param ch I/O channel associated with this call. + * \param batch Handle provided when the batch was started with spdk_accel_batch_create(). * \param dst Destination to write the CRC-32C to. * \param src The source address for the data. * \param seed Four byte seed value. * \param nbytes Length in bytes. - * \param cb Called when this CRC-32C operation completes. + * \param cb_fn Called when this operation completes. + * \param cb_arg Callback argument. * * \return 0 on success, negative errno on failure. */ -int spdk_accel_submit_crc32c(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, - uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb); +int spdk_accel_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg); /** - * Get the size of an acceleration task. + * Submit a CRC-32C calculation request. + * + * This operation will calculate the 4 byte CRC32-C for the given data. + * + * \param ch I/O channel associated with this call. + * \param dst Destination to write the CRC-32C to. + * \param src The source address for the data. + * \param seed Four byte seed value. + * \param nbytes Length in bytes. + * \param cb_fn Called when this CRC-32C operation completes. + * \param cb_arg Callback argument. * - * \return the size of acceleration task. + * \return 0 on success, negative errno on failure. 
*/ -size_t spdk_accel_task_size(void); +int spdk_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, uint32_t seed, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); struct spdk_json_write_ctx; diff --git a/include/spdk/bdev.h b/include/spdk/bdev.h index 0bb39c41058..5fcbe71ccf1 100644 --- a/include/spdk/bdev.h +++ b/include/spdk/bdev.h @@ -193,6 +193,14 @@ void spdk_bdev_get_opts(struct spdk_bdev_opts *opts); int spdk_bdev_set_opts(struct spdk_bdev_opts *opts); +/** + * Examine a block device explicitly + * + * \param name the name or alias of the block device + * \return 0 if block device was examined successfully, suitable errno value otherwise + */ +int spdk_bdev_examine(const char *name); + /** * Block device initialization callback. * diff --git a/include/spdk/bit_pool.h b/include/spdk/bit_pool.h new file mode 100644 index 00000000000..29357959b9f --- /dev/null +++ b/include/spdk/bit_pool.h @@ -0,0 +1,191 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * Bit pool data structure + */ + +#ifndef SPDK_BIT_POOL_H +#define SPDK_BIT_POOL_H + +#include "spdk/stdinc.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct spdk_bit_pool; +struct spdk_bit_array; + +/** + * Return the number of bits that a bit pool is currently sized to hold. + * + * \param pool Bit pool to query. + * + * \return the number of bits. + */ +uint32_t spdk_bit_pool_capacity(const struct spdk_bit_pool *pool); + +/** + * Create a bit pool. + * + * All bits in the pool will be available for allocation. + * + * \param num_bits Number of bits that the bit pool is sized to hold. + * + * \return a pointer to the new bit pool. + */ +struct spdk_bit_pool *spdk_bit_pool_create(uint32_t num_bits); + +/** + * Create a bit pool from an existing spdk_bit_array. + * + * The starting state of the bit pool will be specified by the state + * of the specified spdk_bit_array. + * + * The new spdk_bit_pool will consume the spdk_bit_array and assumes + * responsibility for freeing it. The caller should not use the + * spdk_bit_array after this function returns. + * + * \param array spdk_bit_array representing the starting state of the new bit pool. 
+ * + * \return a pointer to the new bit pool, NULL if one could not be created (in which + * case the caller maintains responsibility for the spdk_bit_array) + */ +struct spdk_bit_pool *spdk_bit_pool_create_from_array(struct spdk_bit_array *array); + +/** + * Free a bit pool and set the pointer to NULL. + * + * \param pool Bit pool to free. + */ +void spdk_bit_pool_free(struct spdk_bit_pool **pool); + +/** + * Create or resize a bit pool. + * + * To create a new bit pool, pass a pointer to a spdk_bit_pool pointer that is + * NULL. + * + * The bit pool will be sized to hold at least num_bits. + * + * If num_bits is larger than the previous size of the bit pool, + * the new bits will all be available for future allocations. + * + * \param pool Bit pool to create/resize. + * \param num_bits Number of bits that the bit pool is sized to hold. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_bit_pool_resize(struct spdk_bit_pool **pool, uint32_t num_bits); + +/** + * Return whether the specified bit has been allocated from the bit pool. + * + * If bit_index is beyond the end of the current size of the bit pool, this + * function will return false (i.e. bits beyond the end of the pool cannot be allocated). + * + * \param pool Bit pool to query. + * \param bit_index The index of a bit to query. + * + * \return true if the bit has been allocated, false otherwise + */ +bool spdk_bit_pool_is_allocated(const struct spdk_bit_pool *pool, uint32_t bit_index); + +/** + * Allocate a bit from the bit pool. + * + * \param pool Bit pool to allocate a bit from + * + * \return index of the allocated bit, UINT32_MAX if no free bits exist + */ +uint32_t spdk_bit_pool_allocate_bit(struct spdk_bit_pool *pool); + +/** + * Free a bit back to the bit pool. + * + * Callers must not try to free a bit that has not been allocated, otherwise the + * pool may become corrupted without notification. 
Freeing a bit that has not + * been allocated will result in an assert in debug builds. + * + * \param pool Bit pool to place the freed bit + * \param bit_index The index of a bit to free. + */ +void spdk_bit_pool_free_bit(struct spdk_bit_pool *pool, uint32_t bit_index); + +/** + * Count the number of bits allocated from the pool. + * + * \param pool The bit pool to count. + * + * \return the number of bits allocated from the pool. + */ +uint32_t spdk_bit_pool_count_allocated(const struct spdk_bit_pool *pool); + +/** + * Count the number of free bits in the pool. + * + * \param pool The bit pool to count. + * + * \return the number of free bits in the pool. + */ +uint32_t spdk_bit_pool_count_free(const struct spdk_bit_pool *pool); + +/** + * Store bitmask from bit pool. + * + * \param pool Bit pool. + * \param mask Destination mask. The mask and the bit pool must be the same size. + */ +void spdk_bit_pool_store_mask(const struct spdk_bit_pool *pool, void *mask); + +/** + * Load bitmask to bit pool. + * + * \param pool Bit pool. + * \param mask Source mask. The mask and the bit pool must be the same size. + */ +void spdk_bit_pool_load_mask(struct spdk_bit_pool *pool, const void *mask); + +/** + * Free all bits back into the bit pool. + * + * \param pool Bit pool.
+ */ +void spdk_bit_pool_free_all_bits(struct spdk_bit_pool *pool); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/spdk/conf.h b/include/spdk/conf.h index 51cbd8d694e..4a5292d32b0 100644 --- a/include/spdk/conf.h +++ b/include/spdk/conf.h @@ -201,6 +201,13 @@ bool spdk_conf_section_get_boolval(struct spdk_conf_section *sp, const char *key */ void spdk_conf_set_as_default(struct spdk_conf *cp); +/** + * Disable sections merging during 'spdk_conf_read()' + * + * \param cp Configuration to be read + */ +void spdk_conf_disable_sections_merge(struct spdk_conf *cp); + #ifdef __cplusplus } #endif diff --git a/include/spdk/env.h b/include/spdk/env.h index 4d522b9417c..0a72627ea8d 100644 --- a/include/spdk/env.h +++ b/include/spdk/env.h @@ -41,6 +41,7 @@ #include "spdk/stdinc.h" #include "spdk/queue.h" +#include "spdk/pci_ids.h" #ifdef __cplusplus extern "C" { @@ -84,6 +85,7 @@ struct spdk_env_opts { const char *hugedir; struct spdk_pci_addr *pci_blacklist; struct spdk_pci_addr *pci_whitelist; + const char *iova_mode; uint64_t base_virtaddr; /** Opaque context for use of the env implementation. */ @@ -158,8 +160,11 @@ void spdk_free(void *buf); void spdk_env_opts_init(struct spdk_env_opts *opts); /** - * Initialize the environment library. This must be called prior to using - * any other functions in this library. + * Initialize or reinitialize the environment library. + * For initialization, this must be called prior to using any other functions + * in this library. For reinitialization, the parameter `opts` must be set to + * NULL and this must be called after the environment library was finished by + * spdk_env_fini() within the same process. * * \param opts Environment initialization options. * \return 0 on success, or negative errno on failure. 
@@ -167,10 +172,11 @@ void spdk_env_opts_init(struct spdk_env_opts *opts); int spdk_env_init(const struct spdk_env_opts *opts); /** - * Release any resources of the environment library that were alllocated with + * Release any resources of the environment library that were allocated with * spdk_env_init(). After this call, no SPDK env function calls may be made. * It is expected that common usage of this function is to call it just before - * terminating the process. + * terminating the process or before reinitializing the environment library + * within the same process. */ void spdk_env_fini(void); @@ -644,12 +650,20 @@ struct spdk_pci_addr { }; struct spdk_pci_id { - uint16_t vendor_id; - uint16_t device_id; - uint16_t subvendor_id; - uint16_t subdevice_id; + uint32_t class_id; /**< Class ID or SPDK_PCI_CLASS_ANY_ID. */ + uint16_t vendor_id; /**< Vendor ID or SPDK_PCI_ANY_ID. */ + uint16_t device_id; /**< Device ID or SPDK_PCI_ANY_ID. */ + uint16_t subvendor_id; /**< Subsystem vendor ID or SPDK_PCI_ANY_ID. */ + uint16_t subdevice_id; /**< Subsystem device ID or SPDK_PCI_ANY_ID. */ }; +/** Device needs PCI BAR mapping (done with either IGB_UIO or VFIO) */ +#define SPDK_PCI_DRIVER_NEED_MAPPING 0x0001 +/** Device needs PCI BAR mapping with enabled write combining (wc) */ +#define SPDK_PCI_DRIVER_WC_ACTIVATE 0x0002 + +void spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags); + struct spdk_pci_device { struct spdk_pci_device *parent; void *dev_handle; @@ -666,7 +680,6 @@ struct spdk_pci_device { uint32_t len, uint32_t offset); int (*cfg_write)(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset); - void (*detach)(struct spdk_pci_device *dev); struct _spdk_pci_device_internal { struct spdk_pci_driver *driver; @@ -686,12 +699,18 @@ struct spdk_pci_device { typedef int (*spdk_pci_enum_cb)(void *enum_ctx, struct spdk_pci_device *pci_dev); -/** - * Get the NVMe PCI driver object. - * - * \return PCI driver. 
- */ -struct spdk_pci_driver *spdk_pci_nvme_get_driver(void); +#define SPDK_PCI_DEVICE(vend, dev) \ + .class_id = SPDK_PCI_CLASS_ANY_ID, \ + .vendor_id = (vend), \ + .device_id = (dev), \ + .subvendor_id = SPDK_PCI_ANY_ID, \ + .subdevice_id = SPDK_PCI_ANY_ID + +#define SPDK_PCI_DRIVER_REGISTER(name, id_table, flags) \ +__attribute__((constructor)) static void _spdk_pci_driver_register_##name(void) \ +{ \ + spdk_pci_driver_register(#name, id_table, flags); \ +} /** * Get the VMD PCI driver object. @@ -721,6 +740,18 @@ struct spdk_pci_driver *spdk_pci_idxd_get_driver(void); */ struct spdk_pci_driver *spdk_pci_virtio_get_driver(void); +/** + * Get PCI driver by name (e.g. "nvme", "vmd", "ioat"). + */ +struct spdk_pci_driver *spdk_pci_get_driver(const char *name); + +/** + * Get the NVMe PCI driver object. + * + * \return PCI driver. + */ +struct spdk_pci_driver *spdk_pci_nvme_get_driver(void); + /** * Enumerate all PCI devices supported by the provided driver and try to * attach those that weren't attached yet. The provided callback will be diff --git a/include/spdk/event.h b/include/spdk/event.h index 484bfccca3a..ea870fe9fcb 100644 --- a/include/spdk/event.h +++ b/include/spdk/event.h @@ -113,6 +113,7 @@ struct spdk_app_opts { size_t num_pci_addr; struct spdk_pci_addr *pci_blacklist; struct spdk_pci_addr *pci_whitelist; + const char *iova_mode; /* DEPRECATED. No longer has any effect. * @@ -139,6 +140,7 @@ struct spdk_app_opts { */ logfunc *log; + uint64_t base_virtaddr; }; /** diff --git a/include/spdk/idxd.h b/include/spdk/idxd.h index 266d92c1985..cb9ebe8b81c 100644 --- a/include/spdk/idxd.h +++ b/include/spdk/idxd.h @@ -32,7 +32,7 @@ */ /** \file - * IDXD accel engine driver public interface + * IDXD driver public interface */ #ifndef SPDK_IDXD_H @@ -56,6 +56,11 @@ struct spdk_idxd_io_channel; */ struct spdk_idxd_device; +/** + * Opaque handle for batching. 
+ */ +struct idxd_batch; + /** * Signature for configuring a channel * @@ -138,7 +143,88 @@ void spdk_idxd_detach(struct spdk_idxd_device *idxd); void spdk_idxd_set_config(uint32_t config_number); /** - * Build and submit a accel engine memory copy request. + * Return the max number of descriptors per batch for IDXD. + * + * \return max number of descriptors per batch. + */ +uint32_t spdk_idxd_batch_get_max(void); + +/** + * Create a batch sequence. + * + * \param chan IDXD channel to submit request. + * + * \return handle to use for subsequent batch requests, NULL on failure. + */ +struct idxd_batch *spdk_idxd_batch_create(struct spdk_idxd_io_channel *chan); + +/** + * Submit a batch sequence. + * + * \param chan IDXD channel to submit request. + * \param batch Handle provided when the batch was started with spdk_idxd_batch_create(). + * \param cb_fn Callback function which will be called when the request is complete. + * \param cb_arg Opaque value which will be passed back as the arg parameter in + * the completion callback. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_idxd_batch_submit(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + spdk_idxd_req_cb cb_fn, void *cb_arg); + +/** + * Cancel a batch sequence. + * + * \param chan IDXD channel to submit request. + * \param batch Handle provided when the batch was started with spdk_idxd_batch_create(). + * + * \return 0 on success, negative errno on failure. + */ +int spdk_idxd_batch_cancel(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch); + +/** + * Synchronous call to prepare a copy request into a previously initialized batch + * created with spdk_idxd_batch_create(). The callback will be called when the copy + * completes after the batch has been submitted by an asynchronous call to + * spdk_idxd_batch_submit(). + * + * \param chan IDXD channel to submit request. + * \param batch Handle provided when the batch was started with spdk_idxd_batch_create().
+ * \param dst Destination virtual address. + * \param src Source virtual address. + * \param nbytes Number of bytes to copy. + * \param cb_fn Callback function which will be called when the request is complete. + * \param cb_arg Opaque value which will be passed back as the arg parameter in + * the completion callback. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_idxd_batch_prep_copy(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg); + +/** + * Synchronous call to prepare a dualcast request into a previously initialized batch + * created with spdk_idxd_batch_create(). The callback will be called when the dualcast + * completes after the batch has been submitted by an asynchronous call to + * spdk_idxd_batch_submit(). + * + * \param chan IDXD channel to submit request. + * \param batch Handle provided when the batch was started with spdk_idxd_batch_create(). + * \param dst1 First destination virtual address (must be 4K aligned). + * \param dst2 Second destination virtual address (must be 4K aligned). + * \param src Source virtual address. + * \param nbytes Number of bytes to copy. + * \param cb_fn Callback function which will be called when the request is complete. + * \param cb_arg Opaque value which will be passed back as the arg parameter in + * the completion callback. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_idxd_batch_prep_dualcast(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst1, void *dst2, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg); + +/** + * Build and submit an idxd memory copy request. * * This function will build the copy descriptor and then immediately submit * by writing to the proper device portal. 
@@ -158,7 +244,7 @@ int spdk_idxd_submit_copy(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn, void *cb_arg); /** - * Build and submit an accel engine dual cast copy request. + * Build and submit an idxd dualcast request. * * This function will build the dual cast descriptor and then immediately submit * by writing to the proper device portal. @@ -178,6 +264,27 @@ int spdk_idxd_submit_dualcast(struct spdk_idxd_io_channel *chan, void *dst1, void *dst2, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg); +/** + * Synchronous call to prepare a compare request into a previously initialized batch + * created with spdk_idxd_batch_create(). The callback will be called when the compare + * completes after the batch has been submitted by an asynchronous call to + * spdk_idxd_batch_submit(). + * + * \param chan IDXD channel to submit request. + * \param batch Handle provided when the batch was started with spdk_idxd_batch_create(). + * \param src1 First source to compare. + * \param src2 Second source to compare. + * \param nbytes Number of bytes to compare. + * \param cb_fn Callback function which will be called when the request is complete. + * \param cb_arg Opaque value which will be passed back as the arg parameter in + * the completion callback. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_idxd_batch_prep_compare(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *src1, void *src2, uint64_t nbytes, spdk_idxd_req_cb cb_fn, + void *cb_arg); + /** * Build and submit a memory compare request. * @@ -199,7 +306,27 @@ int spdk_idxd_submit_compare(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn, void *cb_arg); /** - * Build and submit a accel engine memory fill request. + * Synchronous call to prepare a fill request into a previously initialized batch + * created with spdk_idxd_batch_create(). 
The callback will be called when the fill + * completes after the batch has been submitted by an asynchronous call to + * spdk_idxd_batch_submit(). + * + * \param chan IDXD channel to submit request. + * \param batch Handle provided when the batch was started with spdk_idxd_batch_create(). + * \param dst Destination virtual address. + * \param fill_pattern Repeating eight-byte pattern to use for memory fill. + * \param nbytes Number of bytes to fill. + * \param cb_fn Callback function which will be called when the request is complete. + * \param cb_arg Opaque value which will be passed back as the arg parameter in + * the completion callback. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_idxd_batch_prep_fill(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst, uint64_t fill_pattern, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg); + +/** + * Build and submit an idxd memory fill request. * * This function will build the fill descriptor and then immediately submit * by writing to the proper device portal. @@ -218,6 +345,28 @@ int spdk_idxd_submit_fill(struct spdk_idxd_io_channel *chan, void *dst, uint64_t fill_pattern, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg); +/** + * Synchronous call to prepare a crc32c request into a previously initialized batch + * created with spdk_idxd_batch_create(). The callback will be called when the crc32c + * completes after the batch has been submitted by an asynchronous call to + * spdk_idxd_batch_submit(). + * + * \param chan IDXD channel to submit request. + * \param batch Handle provided when the batch was started with spdk_idxd_batch_create(). + * \param dst Resulting calculation. + * \param src Source virtual address. + * \param seed Four byte CRC-32C seed value. + * \param nbytes Number of bytes to calculate on. + * \param cb_fn Callback function which will be called when the request is complete.
+ * \param cb_arg Opaque value which will be passed back as the arg parameter in + * the completion callback. + * + * \return 0 on success, negative errno on failure. + */ +int spdk_idxd_batch_prep_crc32c(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg); + /** * Build and submit a memory CRC32-C request. * diff --git a/include/spdk/ioat.h b/include/spdk/ioat.h index fc1750957de..c4e66be3b97 100644 --- a/include/spdk/ioat.h +++ b/include/spdk/ioat.h @@ -107,6 +107,15 @@ int spdk_ioat_probe(void *cb_ctx, spdk_ioat_probe_cb probe_cb, spdk_ioat_attach_ */ void spdk_ioat_detach(struct spdk_ioat_chan *ioat); +/** + * Get the maximum number of descriptors supported by the library. + * + * \param chan I/OAT channel + * + * \return maximum number of descriptors. + */ +uint32_t spdk_ioat_get_max_descriptors(struct spdk_ioat_chan *chan); + /** * Build a DMA engine memory copy request. * @@ -207,7 +216,7 @@ void spdk_ioat_flush(struct spdk_ioat_chan *chan); * * \param chan I/OAT channel to check for completions. * - * \return 0 on success, negative errno on failure. + * \return number of events handled on success, negative errno on failure. */ int spdk_ioat_process_events(struct spdk_ioat_chan *chan); diff --git a/include/spdk/log.h b/include/spdk/log.h index b81233d0efc..241250488a8 100644 --- a/include/spdk/log.h +++ b/include/spdk/log.h @@ -70,6 +70,11 @@ void spdk_log_open(logfunc *logf); */ void spdk_log_close(void); +/** + * Enable or disable timestamps + */ +void spdk_log_enable_timestamps(bool value); + enum spdk_log_level { /** All messages will be suppressed. */ SPDK_LOG_DISABLED = -1, @@ -95,26 +100,6 @@ void spdk_log_set_level(enum spdk_log_level level); */ enum spdk_log_level spdk_log_get_level(void); -/** - * Set the log level threshold to include stack trace in log messages. - * Messages with a higher level than this will not contain stack trace. 
You - * can use \c SPDK_LOG_DISABLED to completely disable stack trace printing - * even if it is supported. - * - * \note This function has no effect if SPDK is built without stack trace - * printing support. - * - * \param level Log level threshold for stacktrace. - */ -void spdk_log_set_backtrace_level(enum spdk_log_level level); - -/** - * Get the current log level threshold for showing stack trace in log message. - * - * \return the current log level threshold for stack trace. - */ -enum spdk_log_level spdk_log_get_backtrace_level(void); - /** * Set the current log level threshold for printing to stderr. * Messages with a level less than or equal to this level @@ -132,6 +117,12 @@ void spdk_log_set_print_level(enum spdk_log_level level); */ enum spdk_log_level spdk_log_get_print_level(void); +#ifdef DEBUG +#define SPDK_DEBUGLOG_FLAG_ENABLED(name) spdk_log_get_flag(name) +#else +#define SPDK_DEBUGLOG_FLAG_ENABLED(name) false +#endif + #define SPDK_NOTICELOG(...) \ spdk_log(SPDK_LOG_NOTICE, __FILE__, __LINE__, __func__, __VA_ARGS__) #define SPDK_WARNLOG(...) \ diff --git a/include/spdk/nvme.h b/include/spdk/nvme.h index 95313a35586..30d7360e2bd 100644 --- a/include/spdk/nvme.h +++ b/include/spdk/nvme.h @@ -52,6 +52,7 @@ extern "C" { #define SPDK_NVME_TRANSPORT_NAME_PCIE "PCIE" #define SPDK_NVME_TRANSPORT_NAME_RDMA "RDMA" #define SPDK_NVME_TRANSPORT_NAME_TCP "TCP" +#define SPDK_NVME_TRANSPORT_NAME_CUSTOM "CUSTOM" #define SPDK_NVMF_PRIORITY_MAX_LEN 4 @@ -248,6 +249,12 @@ struct spdk_nvme_ctrlr_opts { * structure are valid. And the library will populate any remaining fields with default values. */ size_t opts_size; + + /** + * The amount of time to spend before timing out during fabric connect on qpairs associated with + * this controller in microseconds. 
+ */ + uint64_t fabrics_connect_timeout_us; }; /** @@ -1601,6 +1608,25 @@ int spdk_nvme_ctrlr_cmd_abort(struct spdk_nvme_ctrlr *ctrlr, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +/** + * Abort previously submitted commands which have cmd_cb_arg as its callback argument. + * + * \param ctrlr NVMe controller to which the commands were submitted. + * \param qpair NVMe queue pair to which the commands were submitted. For admin + * commands, pass NULL for the qpair. + * \param cmd_cb_arg Callback argument for the NVMe commands which this function + * attempts to abort. + * \param cb_fn Callback function to invoke when this function has completed. + * \param cb_arg Argument to pass to the callback function. + * + * \return 0 if successfully submitted, negated errno otherwise. + */ +int spdk_nvme_ctrlr_cmd_abort_ext(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + void *cmd_cb_arg, + spdk_nvme_cmd_cb cb_fn, + void *cb_arg); + /** * Set specific feature for the given NVMe controller. * @@ -1967,6 +1993,27 @@ void spdk_nvme_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr); const struct spdk_nvme_transport_id *spdk_nvme_ctrlr_get_transport_id( struct spdk_nvme_ctrlr *ctrlr); +/** + * \brief Alloc NVMe I/O queue identifier. + * + * This function is only needed for the non-standard case of allocating queues using the raw + * command interface. In most cases \ref spdk_nvme_ctrlr_alloc_io_qpair should be sufficient. + * + * \param ctrlr Opaque handle to NVMe controller. + * \return qid on success, -1 on failure. + */ +int32_t spdk_nvme_ctrlr_alloc_qid(struct spdk_nvme_ctrlr *ctrlr); + +/** + * \brief Free NVMe I/O queue identifier. + * + * This function must only be called with qids previously allocated with \ref spdk_nvme_ctrlr_alloc_qid. + * + * \param ctrlr Opaque handle to NVMe controller. + * \param qid NVMe Queue Identifier. + */ +void spdk_nvme_ctrlr_free_qid(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid); + /** * Opaque handle for a poll group. 
A poll group is a collection of spdk_nvme_qpair * objects that are polled for completions as a unit. @@ -2248,6 +2295,15 @@ uint32_t spdk_nvme_ns_get_optimal_io_boundary(struct spdk_nvme_ns *ns); */ const struct spdk_uuid *spdk_nvme_ns_get_uuid(const struct spdk_nvme_ns *ns); +/** + * Get the Command Set Identifier for the given namespace. + * + * \param ns Namespace to query. + * + * \return the namespace Command Set Identifier. + */ +enum spdk_nvme_csi spdk_nvme_ns_get_csi(const struct spdk_nvme_ns *ns); + /** * \brief Namespace command support flags. */ @@ -2975,6 +3031,22 @@ void spdk_nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, void spdk_nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cpl *cpl); +/** + * \brief Prints (SPDK_NOTICELOG) the contents of an NVMe submission queue entry (command). + * + * \param qid Queue identifier. + * \param cmd Pointer to the submission queue command to be formatted. + */ +void spdk_nvme_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd); + +/** + * \brief Prints (SPDK_NOTICELOG) the contents of an NVMe completion queue entry. + * + * \param qid Queue identifier. + * \param cpl Pointer to the completion queue element to be formatted. 
+ */ +void spdk_nvme_print_completion(uint16_t qid, struct spdk_nvme_cpl *cpl); + struct ibv_context; struct ibv_pd; struct ibv_mr; @@ -3152,6 +3224,10 @@ struct spdk_nvme_transport_ops { int32_t (*qpair_process_completions)(struct spdk_nvme_qpair *qpair, uint32_t max_completions); + int (*qpair_iterate_requests)(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg); + void (*admin_qpair_abort_aers)(struct spdk_nvme_qpair *qpair); struct spdk_nvme_transport_poll_group *(*poll_group_create)(void); diff --git a/include/spdk/nvme_spec.h b/include/spdk/nvme_spec.h index ab5dd97c5e6..91166e3ba44 100644 --- a/include/spdk/nvme_spec.h +++ b/include/spdk/nvme_spec.h @@ -117,13 +117,20 @@ SPDK_STATIC_ASSERT(sizeof(union spdk_nvme_cap_register) == 8, "Incorrect size"); /** * I/O Command Set Selected * - * Only a single command set is defined as of NVMe 1.3 (NVM). + * Only a single command set is defined as of NVMe 1.3 (NVM). Later, it became + * possible to disable I/O Command Sets, that is, configuring it to only use the + * Admin Command Set. With 1.4c and Namespace Types, additional I/O Command Sets + * are available. 
*/ enum spdk_nvme_cc_css { SPDK_NVME_CC_CSS_NVM = 0x0, /**< NVM command set */ + SPDK_NVME_CC_CSS_IOCS = 0x6, /**< One or more I/O command sets */ + SPDK_NVME_CC_CSS_NOIO = 0x7, /**< No I/O, only admin */ }; #define SPDK_NVME_CAP_CSS_NVM (1u << SPDK_NVME_CC_CSS_NVM) /**< NVM command set supported */ +#define SPDK_NVME_CAP_CSS_IOCS (1u << SPDK_NVME_CC_CSS_IOCS) /**< One or more I/O Command sets supported */ +#define SPDK_NVME_CAP_CSS_NOIO (1u << SPDK_NVME_CC_CSS_NOIO) /**< No I/O, only admin */ union spdk_nvme_cc_register { uint32_t raw; @@ -708,7 +715,8 @@ union spdk_nvme_feat_async_event_configuration { uint32_t ns_attr_notice : 1; uint32_t fw_activation_notice : 1; uint32_t telemetry_log_notice : 1; - uint32_t reserved : 21; + uint32_t ana_change_notice : 1; + uint32_t reserved : 20; } bits; }; SPDK_STATIC_ASSERT(sizeof(union spdk_nvme_feat_async_event_configuration) == 4, "Incorrect size"); @@ -972,6 +980,14 @@ SPDK_STATIC_ASSERT(sizeof(union spdk_nvme_cmd_cdw10) == 4, "Incorrect size"); union spdk_nvme_cmd_cdw11 { uint32_t raw; + struct { + /* NVM Set Identifier */ + uint32_t nvmsetid : 16; + uint32_t reserved : 8; + /* Command Set Identifier */ + uint32_t csi : 8; + } identify; + struct { /* Physically Contiguous */ uint32_t pc : 1; @@ -1234,6 +1250,11 @@ enum spdk_nvme_command_specific_status_code { SPDK_NVME_SC_INVALID_NUM_CTRLR_RESOURCES = 0x21, SPDK_NVME_SC_INVALID_RESOURCE_ID = 0x22, + SPDK_NVME_SC_IOCS_NOT_SUPPORTED = 0x29, + SPDK_NVME_SC_IOCS_NOT_ENABLED = 0x2a, + SPDK_NVME_SC_IOCS_COMBINATION_REJECTED = 0x2b, + SPDK_NVME_SC_INVALID_IOCS = 0x2c, + SPDK_NVME_SC_CONFLICTING_ATTRIBUTES = 0x80, SPDK_NVME_SC_INVALID_PROTECTION_INFO = 0x81, SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_RANGE = 0x82, @@ -1258,6 +1279,9 @@ enum spdk_nvme_media_error_status_code { */ enum spdk_nvme_path_status_code { SPDK_NVME_SC_INTERNAL_PATH_ERROR = 0x00, + SPDK_NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS = 0x01, + SPDK_NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE = 0x02, + 
SPDK_NVME_SC_ASYMMETRIC_ACCESS_TRANSITION = 0x03, SPDK_NVME_SC_CONTROLLER_PATH_ERROR = 0x60, @@ -1465,6 +1489,15 @@ enum spdk_nvme_identify_cns { /** List namespace identification descriptors */ SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST = 0x03, + /** Identify namespace indicated in CDW1.NSID, specific to CDW11.CSI */ + SPDK_NVME_IDENTIFY_NS_IOCS = 0x05, + + /** Identify controller, specific to CDW11.CSI */ + SPDK_NVME_IDENTIFY_CTRLR_IOCS = 0x06, + + /** List active NSIDs greater than CDW1.NSID, specific to CDW11.CSI */ + SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST_IOCS = 0x07, + /** List allocated NSIDs greater than CDW1.NSID */ SPDK_NVME_IDENTIFY_ALLOCATED_NS_LIST = 0x10, @@ -1482,6 +1515,15 @@ enum spdk_nvme_identify_cns { /** Get secondary controller list */ SPDK_NVME_IDENTIFY_SECONDARY_CTRLR_LIST = 0x15, + + /** List allocated NSIDs greater than CDW1.NSID, specific to CDW11.CSI */ + SPDK_NVME_IDENTIFY_ALLOCATED_NS_LIST_IOCS = 0x1a, + + /** Identify namespace if CDW1.NSID is allocated, specific to CDW11.CSI */ + SPDK_NVME_IDENTIFY_NS_ALLOCATED_IOCS = 0x1b, + + /** Identify I/O Command Sets */ + SPDK_NVME_IDENTIFY_IOCS = 0x1c, }; /** NVMe over Fabrics controller model */ @@ -1592,7 +1634,8 @@ struct __attribute__((packed)) __attribute__((aligned)) spdk_nvme_ctrlr_data { uint8_t multi_port : 1; uint8_t multi_host : 1; uint8_t sr_iov : 1; - uint8_t reserved : 5; + uint8_t ana_reporting : 1; + uint8_t reserved : 4; } cmic; /** maximum data transfer size */ @@ -1620,7 +1663,12 @@ struct __attribute__((packed)) __attribute__((aligned)) spdk_nvme_ctrlr_data { /** Supports sending Firmware Activation Notices. */ uint32_t fw_activation_notices : 1; - uint32_t reserved2 : 22; + uint32_t reserved2 : 1; + + /** Supports Asymmetric Namespace Access Change Notices.
*/ + uint32_t ana_change_notices : 1; + + uint32_t reserved3 : 20; } oaes; /** controller attributes */ @@ -1823,7 +1871,31 @@ struct __attribute__((packed)) __attribute__((aligned)) spdk_nvme_ctrlr_data { } bits; } sanicap; - uint8_t reserved3[180]; + /* bytes 332-341 */ + uint8_t reserved3[10]; + + /** ANA transition time */ + uint8_t anatt; + + /* bytes 343: Asymmetric namespace access capabilities */ + struct { + uint8_t ana_optimized_state : 1; + uint8_t ana_non_optimized_state : 1; + uint8_t ana_inaccessible_state : 1; + uint8_t ana_persistent_loss_state : 1; + uint8_t ana_change_state : 1; + uint8_t reserved : 1; + uint8_t no_change_anagrpid : 1; + uint8_t non_zero_anagrpid : 1; + } anacap; + + /* bytes 344-347: ANA group identifier maximum */ + uint32_t anagrpmax; + /* bytes 348-351: number of ANA group identifiers */ + uint32_t nanagrpid; + + /* bytes 352-511 */ + uint8_t reserved352[160]; /* bytes 512-703: nvm command set attributes */ @@ -1895,7 +1967,10 @@ struct __attribute__((packed)) __attribute__((aligned)) spdk_nvme_ctrlr_data { struct spdk_nvme_cdata_sgls sgls; - uint8_t reserved4[228]; + /* maximum number of allowed namespaces */ + uint32_t mnan; + + uint8_t reserved4[224]; uint8_t subnqn[SPDK_NVME_NQN_FIELD_SIZE]; @@ -2154,7 +2229,12 @@ struct spdk_nvme_ns_data { /** NVM capacity */ uint64_t nvmcap[2]; - uint8_t reserved64[40]; + uint8_t reserved64[28]; + + /** ANA group identifier */ + uint32_t anagrpid; + + uint8_t reserved96[8]; /** namespace globally unique identifier */ uint8_t nguid[16]; @@ -2398,7 +2478,12 @@ enum spdk_nvme_log_page { /** Controller initiated telemetry log (optional) */ SPDK_NVME_LOG_TELEMETRY_CTRLR_INITIATED = 0x08, - /* 0x09-0x6F - reserved */ + /* 0x09-0x0B - reserved */ + + /** Asymmetric namespace access log (optional) */ + SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS = 0x0C, + + /* 0x0D-0x6F - reserved */ /** Discovery(refer to the NVMe over Fabrics specification) */ SPDK_NVME_LOG_DISCOVERY = 0x70, @@ -2646,8 +2731,10
@@ enum spdk_nvme_async_event_info_notice { SPDK_NVME_ASYNC_EVENT_FW_ACTIVATION_START = 0x1, /* Telemetry Log Changed */ SPDK_NVME_ASYNC_EVENT_TELEMETRY_LOG_CHANGED = 0x2, + /* Asymmetric Namespace Access Change */ + SPDK_NVME_ASYNC_EVENT_ANA_CHANGE = 0x3, - /* 0x3 - 0xFF Reserved */ + /* 0x4 - 0xFF Reserved */ }; /** @@ -2694,6 +2781,40 @@ struct spdk_nvme_firmware_page { }; SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_firmware_page) == 512, "Incorrect size"); +/** + * Asymmetric Namespace Access page (\ref SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS) + */ +struct spdk_nvme_ana_page { + uint64_t change_count; + uint16_t num_ana_group_desc; + uint8_t reserved[6]; +}; +SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_ana_page) == 16, "Incorrect size"); + +/* Asymmetric namespace access state */ +enum spdk_nvme_ana_state { + SPDK_NVME_ANA_OPTIMIZED_STATE = 0x1, + SPDK_NVME_ANA_NON_OPTIMIZED_STATE = 0x2, + SPDK_NVME_ANA_INACCESSIBLE_STATE = 0x3, + SPDK_NVME_ANA_PERSISTENT_LOSS_STATE = 0x4, + SPDK_NVME_ANA_CHANGE_STATE = 0xF, +}; + +/* ANA group descriptor */ +struct spdk_nvme_ana_group_descriptor { + uint32_t ana_group_id; + uint32_t num_of_nsid; + uint64_t change_count; + + uint8_t ana_state : 4; + uint8_t reserved0 : 4; + + uint8_t reserved1[15]; + + uint32_t nsid[]; +}; +SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_ana_group_descriptor) == 32, "Incorrect size"); + /** * Namespace attachment Type Encoding */ @@ -2739,6 +2860,9 @@ enum spdk_nvme_nidt { /** Namespace UUID */ SPDK_NVME_NIDT_UUID = 0x03, + + /** Namespace Command Set Identifier */ + SPDK_NVME_NIDT_CSI = 0x04, }; struct spdk_nvme_ns_id_desc { @@ -2762,6 +2886,12 @@ struct spdk_nvme_ctrlr_list { }; SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_ctrlr_list) == 4096, "Incorrect size"); +enum spdk_nvme_csi { + SPDK_NVME_CSI_NVM = 0x0, + SPDK_NVME_CSI_KV = 0x1, + SPDK_NVME_CSI_ZNS = 0x2, +}; + enum spdk_nvme_secure_erase_setting { SPDK_NVME_FMT_NVM_SES_NO_SECURE_ERASE = 0x0, SPDK_NVME_FMT_NVM_SES_USER_DATA_ERASE = 0x1, @@ 
-2886,6 +3016,9 @@ SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_fw_commit) == 4, "Incorrect size"); (cpl)->status.sc == SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR || \ (cpl)->status.sc == SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR)) +#define spdk_nvme_cpl_is_abort_success(cpl) \ + (spdk_nvme_cpl_is_success(cpl) && !((cpl)->cdw0 & 1U)) + /** Set fused operation */ #define SPDK_NVME_IO_FLAGS_FUSE_FIRST (SPDK_NVME_CMD_FUSE_FIRST << 0) #define SPDK_NVME_IO_FLAGS_FUSE_SECOND (SPDK_NVME_CMD_FUSE_SECOND << 0) diff --git a/include/spdk/nvmf.h b/include/spdk/nvmf.h index 4057db1fabe..980dd08fee4 100644 --- a/include/spdk/nvmf.h +++ b/include/spdk/nvmf.h @@ -85,6 +85,10 @@ struct spdk_nvmf_transport_opts { bool c2h_success; bool dif_insert_or_strip; uint32_t sock_priority; + int acceptor_backlog; + uint32_t abort_timeout_sec; + /* ms */ + uint32_t association_timeout; }; struct spdk_nvmf_poll_group_stat { @@ -116,7 +120,9 @@ struct spdk_nvmf_transport_poll_group_stat { }; /** - * Function to be called once the listener is associated with a subsystem. + * Function to be called once asynchronous listen add and remove + * operations are completed. See spdk_nvmf_subsystem_add_listener() + * and spdk_nvmf_transport_stop_listen_async(). * * \param ctx Context argument passed to this function. * \param status 0 if it completed successfully, or negative errno if it failed. @@ -643,6 +649,20 @@ void spdk_nvmf_subsystem_allow_any_listener( bool spdk_nvmf_subsytem_any_listener_allowed( struct spdk_nvmf_subsystem *subsystem); +/** + * Set whether a subsystem supports Asymmetric Namespace Access (ANA) + * reporting. + * + * May only be performed on subsystems in the INACTIVE state. + * + * \param subsystem Subsystem to modify. + * \param ana_reporting true to support or false not to support ANA reporting. + * + * \return 0 on success, or negated errno value on failure. 
+ */ +int spdk_nvmf_subsystem_set_ana_reporting(struct spdk_nvmf_subsystem *subsystem, + bool ana_reporting); + /** NVMe-oF target namespace creation options */ struct spdk_nvmf_ns_opts { /** @@ -986,6 +1006,26 @@ int spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport, const struct spdk_nvme_transport_id *trid); +/** + * Stop accepting new connections at the provided address. + * + * This is a counterpart to spdk_nvmf_tgt_listen(). It differs + * from spdk_nvmf_transport_stop_listen() in that it also destroys all + * qpairs that are connected to the specified listener. Because + * this function disconnects the qpairs, it has to be asynchronous. + * + * \param transport The transport associated with the listen address. + * \param trid The address to stop listening at. + * \param cb_fn The function to call on completion. + * \param cb_arg The argument to pass to the cb_fn. + * + * \return int. 0 when the asynchronous process starts successfully or a negated errno on failure. + */ +int spdk_nvmf_transport_stop_listen_async(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid, + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn, + void *cb_arg); + /** * \brief Get current transport poll group statistics. 
* diff --git a/include/spdk/nvmf_cmd.h b/include/spdk/nvmf_cmd.h index b1c50057282..6cbac7de0bd 100644 --- a/include/spdk/nvmf_cmd.h +++ b/include/spdk/nvmf_cmd.h @@ -132,6 +132,19 @@ typedef void (*spdk_nvmf_nvme_passthru_cmd_cb)(struct spdk_nvmf_request *req); int spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, struct spdk_nvmf_request *req, spdk_nvmf_nvme_passthru_cmd_cb cb_fn); +/** + * Attempts to abort a request in the specified bdev + * + * \param bdev Bdev that is processing req_to_abort + * \param desc Bdev desc + * \param ch Channel on which req_to_abort was originally submitted + * \param req Abort cmd req + * \param req_to_abort The request that should be aborted + */ +int spdk_nvmf_bdev_ctrlr_abort_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct spdk_nvmf_request *req_to_abort); + /** * Provide access to the underlying bdev that is associated with a namespace. * @@ -200,4 +213,14 @@ struct spdk_nvme_cmd *spdk_nvmf_request_get_cmd(struct spdk_nvmf_request *req); */ struct spdk_nvme_cpl *spdk_nvmf_request_get_response(struct spdk_nvmf_request *req); +/** + * Get the request to abort that is associated with this request. 
+ * The req to abort is only set if the request is processing a SPDK_NVME_OPC_ABORT cmd + * + * \param req The NVMe-oF abort request + * + * \return req_to_abort The NVMe-oF request that is in the process of being aborted + */ +struct spdk_nvmf_request *spdk_nvmf_request_get_req_to_abort(struct spdk_nvmf_request *req); + #endif /* SPDK_NVMF_CMD_H_ */ diff --git a/include/spdk/nvmf_transport.h b/include/spdk/nvmf_transport.h index 28300ecb987..6aaae4ce8a4 100644 --- a/include/spdk/nvmf_transport.h +++ b/include/spdk/nvmf_transport.h @@ -94,6 +94,9 @@ struct spdk_nvmf_request { struct spdk_nvmf_dif_info dif; spdk_nvmf_nvme_passthru_cmd_cb cmd_cb_fn; struct spdk_nvmf_request *first_fused_req; + struct spdk_nvmf_request *req_to_abort; + struct spdk_poller *poller; + uint64_t timeout_tsc; STAILQ_ENTRY(spdk_nvmf_request) buf_link; TAILQ_ENTRY(spdk_nvmf_request) link; @@ -116,10 +119,12 @@ struct spdk_nvmf_qpair { struct spdk_nvmf_transport *transport; struct spdk_nvmf_ctrlr *ctrlr; struct spdk_nvmf_poll_group *group; + const struct spdk_nvme_transport_id *trid; uint16_t qid; uint16_t sq_head; uint16_t sq_head_max; + bool disconnect_started; struct spdk_nvmf_request *first_fused_req; @@ -237,17 +242,11 @@ struct spdk_nvmf_transport_ops { * action here, as the enforcement of the association is done in the generic * code. * - * The association is not considered complete until cb_fn is called. New - * connections on the listener targeting this subsystem will be rejected - * until that time. - * - * Pass a negated errno code to `cb_fn` to block the association. 0 to allow. + * Returns a negated errno code to block the association. 0 to allow. 
*/ - void (*listen_associate)(struct spdk_nvmf_transport *transport, - const struct spdk_nvmf_subsystem *subsystem, - const struct spdk_nvme_transport_id *trid, - spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn, - void *cb_arg); + int (*listen_associate)(struct spdk_nvmf_transport *transport, + const struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid); /** * Check for new connections on the transport. @@ -334,6 +333,15 @@ struct spdk_nvmf_transport_ops { int (*qpair_get_listen_trid)(struct spdk_nvmf_qpair *qpair, struct spdk_nvme_transport_id *trid); + /* + * Abort the request which the abort request specifies. + * This function can complete synchronously or asynchronously, but + * is expected to call spdk_nvmf_request_complete() in the end + * for both cases. + */ + void (*qpair_abort_request)(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req); + /* * Get transport poll group statistics */ @@ -396,7 +404,6 @@ int spdk_nvmf_request_get_buffers_multi(struct spdk_nvmf_request *req, bool spdk_nvmf_request_get_dif_ctx(struct spdk_nvmf_request *req, struct spdk_dif_ctx *dif_ctx); void spdk_nvmf_request_exec(struct spdk_nvmf_request *req); -void spdk_nvmf_request_exec_fabrics(struct spdk_nvmf_request *req); int spdk_nvmf_request_free(struct spdk_nvmf_request *req); int spdk_nvmf_request_complete(struct spdk_nvmf_request *req); @@ -417,6 +424,16 @@ void spdk_nvmf_poll_group_remove(struct spdk_nvmf_qpair *qpair); struct spdk_nvmf_subsystem * spdk_nvmf_ctrlr_get_subsystem(struct spdk_nvmf_ctrlr *ctrlr); +/** + * Get the NVMe-oF controller ID. 
+ * + * \param ctrlr The NVMe-oF controller + * + * \return The NVMe-oF controller ID + */ +uint16_t +spdk_nvmf_ctrlr_get_id(struct spdk_nvmf_ctrlr *ctrlr); + static inline enum spdk_nvme_data_transfer spdk_nvmf_req_get_xfer(struct spdk_nvmf_request *req) { enum spdk_nvme_data_transfer xfer; diff --git a/include/spdk/pci_ids.h b/include/spdk/pci_ids.h index 8896e0905b1..816eb0a8443 100644 --- a/include/spdk/pci_ids.h +++ b/include/spdk/pci_ids.h @@ -53,6 +53,7 @@ extern "C" { #define SPDK_PCI_VID_CNEXLABS 0x1d1d #define SPDK_PCI_VID_VMWARE 0x15ad +#define SPDK_PCI_CLASS_ANY_ID 0xffffff /** * PCI class code for NVMe devices. * diff --git a/include/spdk/sock.h b/include/spdk/sock.h index 0acdd4f59c2..a51ba870693 100644 --- a/include/spdk/sock.h +++ b/include/spdk/sock.h @@ -41,6 +41,7 @@ #include "spdk/stdinc.h" #include "spdk/queue.h" +#include "spdk/json.h" #ifdef __cplusplus extern "C" { @@ -88,14 +89,35 @@ struct spdk_sock_request { */ struct spdk_sock_impl_opts { /** - * Size of sock receive buffer. Used by posix socket module. + * Size of sock receive buffer. Used by posix and uring socket modules. */ uint32_t recv_buf_size; /** - * Size of sock send buffer. Used by posix socket module. + * Size of sock send buffer. Used by posix and uring socket modules. */ uint32_t send_buf_size; + + /** + * Enable or disable receive pipe. Used by posix and uring socket modules. + */ + bool enable_recv_pipe; + + /** + * Enable or disable use of zero copy flow on send. Used by posix socket module. + */ + bool enable_zerocopy_send; + + /** + * Enable or disable quick ACK. Used by posix and uring socket modules. + */ + bool enable_quickack; + + /** + * Enable or disable placement_id. Used by posix and uring socket modules. + */ + bool enable_placement_id; + }; /** @@ -155,7 +177,7 @@ int spdk_sock_getaddr(struct spdk_sock *sock, char *saddr, int slen, uint16_t *s * \param impl_name The sock_implementation to use, such as "posix". 
If impl_name is * specified, it will *only* try to connect on that impl. If it is NULL, it will try * all the sock implementations in order and uses the first sock implementation which - * can connect. For example, it may try vpp first, then fall back to posix. + * can connect. * * \return a pointer to the connected socket on success, or NULL on failure. */ @@ -171,7 +193,7 @@ struct spdk_sock *spdk_sock_connect(const char *ip, int port, char *impl_name); * \param impl_name The sock_implementation to use, such as "posix". If impl_name is * specified, it will *only* try to connect on that impl. If it is NULL, it will try * all the sock implementations in order and uses the first sock implementation which - * can connect. For example, it may try vpp first, then fall back to posix. + * can connect. * \param opts The sock option pointer provided by the user which should not be NULL pointer. * * \return a pointer to the connected socket on success, or NULL on failure. @@ -189,7 +211,7 @@ struct spdk_sock *spdk_sock_connect_ext(const char *ip, int port, char *impl_nam * \param impl_name The sock_implementation to use, such as "posix". If impl_name is * specified, it will *only* try to listen on that impl. If it is NULL, it will try * all the sock implementations in order and uses the first sock implementation which - * can listen. For example, it may try vpp first, then fall back to posix. + * can listen. * * \return a pointer to the listened socket on success, or NULL on failure. */ @@ -205,7 +227,7 @@ struct spdk_sock *spdk_sock_listen(const char *ip, int port, char *impl_name); * \param impl_name The sock_implementation to use, such as "posix". If impl_name is * specified, it will *only* try to listen on that impl. If it is NULL, it will try * all the sock implementations in order and uses the first sock implementation which - * can listen. For example, it may try vpp first, then fall back to posix. + * can listen. 
* \param opts The sock option pointer provided by the user, which should not be NULL pointer. * * \return a pointer to the listened socket on success, or NULL on failure. @@ -450,6 +472,13 @@ int spdk_sock_impl_get_opts(const char *impl_name, struct spdk_sock_impl_opts *o int spdk_sock_impl_set_opts(const char *impl_name, const struct spdk_sock_impl_opts *opts, size_t len); +/** + * Write socket subsystem configuration into provided JSON context. + * + * \param w JSON write context + */ +void spdk_sock_write_config_json(struct spdk_json_write_ctx *w); + #ifdef __cplusplus } #endif diff --git a/include/spdk/thread.h b/include/spdk/thread.h index 27c575b31ff..841cf39a824 100644 --- a/include/spdk/thread.h +++ b/include/spdk/thread.h @@ -47,6 +47,11 @@ extern "C" { #endif +enum spdk_thread_poller_rc { + SPDK_POLLER_IDLE, + SPDK_POLLER_BUSY, +}; + /** * A stackless, lightweight thread. */ diff --git a/include/spdk/version.h b/include/spdk/version.h index 689c2a3ce34..86b199e04f5 100644 --- a/include/spdk/version.h +++ b/include/spdk/version.h @@ -46,7 +46,7 @@ /** * Minor version number (month of original release). */ -#define SPDK_VERSION_MINOR 7 +#define SPDK_VERSION_MINOR 10 /** * Patch level. 
diff --git a/include/spdk_internal/accel_engine.h b/include/spdk_internal/accel_engine.h index 63c82ea3bd5..b77a06addc5 100644 --- a/include/spdk_internal/accel_engine.h +++ b/include/spdk_internal/accel_engine.h @@ -41,21 +41,41 @@ struct spdk_accel_task { spdk_accel_completion_cb cb; + void *cb_arg; + struct accel_io_channel *accel_ch; + TAILQ_ENTRY(spdk_accel_task) link; uint8_t offload_ctx[0]; }; struct spdk_accel_engine { uint64_t (*get_capabilities)(void); - int (*copy)(void *cb_arg, struct spdk_io_channel *ch, void *dst, void *src, - uint64_t nbytes, spdk_accel_completion_cb cb); - int (*dualcast)(void *cb_arg, struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, - uint64_t nbytes, spdk_accel_completion_cb cb); - int (*compare)(void *cb_arg, struct spdk_io_channel *ch, void *src1, void *src2, - uint64_t nbytes, spdk_accel_completion_cb cb); - int (*fill)(void *cb_arg, struct spdk_io_channel *ch, void *dst, uint8_t fill, - uint64_t nbytes, spdk_accel_completion_cb cb); - int (*crc32c)(void *cb_arg, struct spdk_io_channel *ch, uint32_t *dst, void *src, - uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb); + int (*copy)(struct spdk_io_channel *ch, void *dst, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); + int (*dualcast)(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); + uint32_t (*batch_get_max)(void); + struct spdk_accel_batch *(*batch_create)(struct spdk_io_channel *ch); + int (*batch_prep_copy)(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); + int (*batch_prep_dualcast)(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst1, void *dst2, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg); + int (*batch_prep_compare)(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *src1, 
void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); + int (*batch_prep_fill)(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst, uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); + int (*batch_prep_crc32c)(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg); + int (*batch_submit)(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg); + int (*batch_cancel)(struct spdk_io_channel *ch, struct spdk_accel_batch *batch); + int (*compare)(struct spdk_io_channel *ch, void *src1, void *src2, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); + int (*fill)(struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); + int (*crc32c)(struct spdk_io_channel *ch, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); struct spdk_io_channel *(*get_io_channel)(void); }; diff --git a/include/spdk_internal/log.h b/include/spdk_internal/log.h index 0993d1016ce..3a14ec65292 100644 --- a/include/spdk_internal/log.h +++ b/include/spdk_internal/log.h @@ -44,7 +44,6 @@ extern enum spdk_log_level g_spdk_log_level; extern enum spdk_log_level g_spdk_log_print_level; -extern enum spdk_log_level g_spdk_log_backtrace_level; struct spdk_log_flag { TAILQ_ENTRY(spdk_log_flag) tailq; diff --git a/include/spdk_internal/nvme_tcp.h b/include/spdk_internal/nvme_tcp.h index 45e8def5154..7065bc060ee 100644 --- a/include/spdk_internal/nvme_tcp.h +++ b/include/spdk_internal/nvme_tcp.h @@ -42,6 +42,7 @@ #define SPDK_NVME_TCP_DIGEST_LEN 4 #define SPDK_NVME_TCP_DIGEST_ALIGNMENT 4 #define SPDK_NVME_TCP_QPAIR_EXIT_TIMEOUT 30 +#define SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR 8 /* * Maximum number of SGL elements. 
diff --git a/include/spdk_internal/sock.h b/include/spdk_internal/sock.h index d88d6bd03e9..23f46415787 100644 --- a/include/spdk_internal/sock.h +++ b/include/spdk_internal/sock.h @@ -49,6 +49,8 @@ extern "C" { #define MAX_EVENTS_PER_POLL 32 #define DEFAULT_SOCK_PRIORITY 0 #define MIN_SOCK_PIPE_SIZE 1024 +#define MIN_SO_RCVBUF_SIZE (2 * 1024 * 1024) +#define MIN_SO_SNDBUF_SIZE (2 * 1024 * 1024) struct spdk_sock { struct spdk_net_impl *net_impl; @@ -63,6 +65,7 @@ struct spdk_sock { TAILQ_HEAD(, spdk_sock_request) queued_reqs; TAILQ_HEAD(, spdk_sock_request) pending_reqs; int queued_iovcnt; + int placement_id; struct { uint8_t closed : 1; diff --git a/include/spdk_internal/uring.h b/include/spdk_internal/uring.h index ff22f11d42a..86c39945c13 100644 --- a/include/spdk_internal/uring.h +++ b/include/spdk_internal/uring.h @@ -36,16 +36,4 @@ #include -#ifndef __NR_sys_io_uring_enter -#define __NR_sys_io_uring_enter 426 -#endif - -static int -spdk_io_uring_enter(int ring_fd, unsigned int to_submit, - unsigned int min_complete, unsigned int flags) -{ - return syscall(__NR_sys_io_uring_enter, ring_fd, to_submit, - min_complete, flags, NULL, 0); -} - #endif /* SPDK_INTERNAL_URING_H */ diff --git a/isa-l b/isa-l index f3993f5c0b6..806b55ee578 160000 --- a/isa-l +++ b/isa-l @@ -1 +1 @@ -Subproject commit f3993f5c0b69118a229c2dfbb360515cd34a24e6 +Subproject commit 806b55ee578efd8158962b90121a4568eb1ecb66 diff --git a/lib/accel/accel_engine.c b/lib/accel/accel_engine.c index f040f9af456..4682406b00b 100644 --- a/lib/accel/accel_engine.c +++ b/lib/accel/accel_engine.c @@ -48,7 +48,8 @@ * later in this file. 
*/ -#define ALIGN_4K 0x1000 +#define ALIGN_4K 0x1000 +#define MAX_TASKS_PER_CHANNEL 0x400 /* Largest context size for all accel modules */ static size_t g_max_accel_module_size = 0; @@ -66,8 +67,25 @@ static TAILQ_HEAD(, spdk_accel_module_if) spdk_accel_module_list = struct accel_io_channel { struct spdk_accel_engine *engine; struct spdk_io_channel *ch; + void *task_pool_base; + TAILQ_HEAD(, spdk_accel_task) task_pool; }; +/* Forward declarations of software implementations used when an + * engine has not implemented the capability. + */ +static int sw_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, + void *cb_arg); + /* Registration of hw modules (currently supports only 1 at a time) */ void spdk_accel_hw_engine_register(struct spdk_accel_engine *accel_engine) @@ -97,9 +115,10 @@ accel_sw_unregister(void) static void _accel_engine_done(void *ref, int status) { - struct spdk_accel_task *req = (struct spdk_accel_task *)ref; + struct spdk_accel_task *accel_task = (struct spdk_accel_task *)ref; - req->cb(req, status); + accel_task->cb(accel_task->cb_arg, status); + TAILQ_INSERT_TAIL(&accel_task->accel_ch->task_pool, accel_task, link); } uint64_t @@ -107,81 +126,300 @@ spdk_accel_get_capabilities(struct spdk_io_channel *ch) { struct accel_io_channel *accel_ch = 
spdk_io_channel_get_ctx(ch); + /* All engines are required to implement this API. */ return accel_ch->engine->get_capabilities(); } +inline static struct spdk_accel_task * +_get_task(struct accel_io_channel *accel_ch, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_task = TAILQ_FIRST(&accel_ch->task_pool); + + if (accel_task == NULL) { + return NULL; + } + TAILQ_REMOVE(&accel_ch->task_pool, accel_task, link); + + accel_task->cb = cb_fn; + accel_task->cb_arg = cb_arg; + accel_task->accel_ch = accel_ch; + + return accel_task; +} + /* Accel framework public API for copy function */ int -spdk_accel_submit_copy(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, - void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb) +spdk_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) { struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } - accel_req->cb = cb; - return accel_ch->engine->copy(accel_req->offload_ctx, accel_ch->ch, dst, src, nbytes, - _accel_engine_done); + /* If the engine does not support it, fallback to the sw implementation. 
*/ + if (accel_ch->engine->copy) { + return accel_ch->engine->copy(accel_ch->ch, dst, src, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } else { + return sw_accel_submit_copy(accel_ch->ch, dst, src, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } } /* Accel framework public API for dual cast copy function */ int -spdk_accel_submit_dualcast(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, - void *dst1, void *dst2, void *src, uint64_t nbytes, - spdk_accel_completion_cb cb) +spdk_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); return -EINVAL; } - accel_req->cb = cb; - return accel_ch->engine->dualcast(accel_req->offload_ctx, accel_ch->ch, dst1, dst2, src, nbytes, - _accel_engine_done); + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->dualcast) { + return accel_ch->engine->dualcast(accel_ch->ch, dst1, dst2, src, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } else { + return sw_accel_submit_dualcast(accel_ch->ch, dst1, dst2, src, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } } -/* Accel framework public API for compare function */ +/* Accel framework public API for batch_create function. All engines are + * required to implement this API. + */ +struct spdk_accel_batch * +spdk_accel_batch_create(struct spdk_io_channel *ch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + return accel_ch->engine->batch_create(accel_ch->ch); +} + +/* Accel framework public API for batch_submit function. 
All engines are + * required to implement this API. + */ int -spdk_accel_submit_compare(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, - void *src1, void *src2, uint64_t nbytes, spdk_accel_completion_cb cb) +spdk_accel_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) { struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; - accel_req->cb = cb; - return accel_ch->engine->compare(accel_req->offload_ctx, accel_ch->ch, src1, src2, nbytes, - _accel_engine_done); + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + return accel_ch->engine->batch_submit(accel_ch->ch, batch, _accel_engine_done, + accel_task->offload_ctx); } -/* Accel framework public API for fill function */ +/* Accel framework public API for getting max batch. All engines are + * required to implement this API. + */ +uint32_t +spdk_accel_batch_get_max(struct spdk_io_channel *ch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + return accel_ch->engine->batch_get_max(); +} + +/* Accel framework public API for when an app is unable to complete a batch sequence, + * it cancels with this API. + */ int -spdk_accel_submit_fill(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, - void *dst, uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb) +spdk_accel_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch) { struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); - accel_req->cb = cb; - return accel_ch->engine->fill(accel_req->offload_ctx, accel_ch->ch, dst, fill, nbytes, - _accel_engine_done); + return accel_ch->engine->batch_cancel(accel_ch->ch, batch); } -/* Accel framework public API for CRC-32C function */ +/* Accel framework public API for batch prep_copy function. All engines are + * required to implement this API. 
+ */ +int +spdk_accel_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + return accel_ch->engine->batch_prep_copy(accel_ch->ch, batch, dst, src, nbytes, + _accel_engine_done, accel_task->offload_ctx); +} + +/* Accel framework public API for batch prep_dualcast function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst1, void *dst2, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + return accel_ch->engine->batch_prep_dualcast(accel_ch->ch, batch, dst1, dst2, src, + nbytes, _accel_engine_done, accel_task->offload_ctx); +} + +/* Accel framework public API for batch prep_compare function. All engines are + * required to implement this API. 
+ */ +int +spdk_accel_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *src1, void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, + void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + return accel_ch->engine->batch_prep_compare(accel_ch->ch, batch, src1, src2, nbytes, + _accel_engine_done, accel_task->offload_ctx); +} + +/* Accel framework public API for batch prep_fill function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + return accel_ch->engine->batch_prep_fill(accel_ch->ch, batch, dst, fill, nbytes, + _accel_engine_done, accel_task->offload_ctx); +} + +/* Accel framework public API for batch prep_crc32c function. All engines are + * required to implement this API. 
+ */ +int +spdk_accel_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + return accel_ch->engine->batch_prep_crc32c(accel_ch->ch, batch, dst, src, seed, nbytes, + _accel_engine_done, accel_task->offload_ctx); +} + +/* Accel framework public API for compare function */ int -spdk_accel_submit_crc32c(struct spdk_accel_task *accel_req, struct spdk_io_channel *ch, - uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb) +spdk_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) { struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; - accel_req->cb = cb; - return accel_ch->engine->crc32c(accel_req->offload_ctx, accel_ch->ch, dst, src, - seed, nbytes, _accel_engine_done); + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + /* If the engine does not support it, fallback to the sw implementation. 
*/ + if (accel_ch->engine->compare) { + return accel_ch->engine->compare(accel_ch->ch, src1, src2, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } else { + return sw_accel_submit_compare(accel_ch->ch, src1, src2, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } } +/* Accel framework public API for fill function */ +int +spdk_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->fill) { + return accel_ch->engine->fill(accel_ch->ch, dst, fill, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } else { + return sw_accel_submit_fill(accel_ch->ch, dst, fill, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } +} -/* Returns the largest context size of the accel modules. */ -size_t -spdk_accel_task_size(void) +/* Accel framework public API for CRC-32C function */ +int +spdk_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, uint32_t seed, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { - return g_max_accel_module_size; + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + + accel_task = _get_task(accel_ch, cb_fn, cb_arg); + if (accel_task == NULL) { + return -ENOMEM; + } + + /* If the engine does not support it, fallback to the sw implementation. 
*/ + if (accel_ch->engine->crc32c) { + return accel_ch->engine->crc32c(accel_ch->ch, dst, src, seed, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } else { + return sw_accel_submit_crc32c(accel_ch->ch, dst, src, seed, nbytes, + _accel_engine_done, accel_task->offload_ctx); + } } /* Helper function when when accel modules register with the framework. */ @@ -198,6 +436,22 @@ static int accel_engine_create_cb(void *io_device, void *ctx_buf) { struct accel_io_channel *accel_ch = ctx_buf; + struct spdk_accel_task *accel_task; + uint8_t *task_mem; + int i; + + accel_ch->task_pool_base = calloc(MAX_TASKS_PER_CHANNEL, g_max_accel_module_size); + if (accel_ch->task_pool_base == NULL) { + return -ENOMEM; + } + + TAILQ_INIT(&accel_ch->task_pool); + task_mem = accel_ch->task_pool_base; + for (i = 0 ; i < MAX_TASKS_PER_CHANNEL; i++) { + accel_task = (struct spdk_accel_task *)task_mem; + TAILQ_INSERT_TAIL(&accel_ch->task_pool, accel_task, link); + task_mem += g_max_accel_module_size; + } if (g_hw_accel_engine != NULL) { accel_ch->ch = g_hw_accel_engine->get_io_channel(); @@ -221,6 +475,7 @@ accel_engine_destroy_cb(void *io_device, void *ctx_buf) struct accel_io_channel *accel_ch = ctx_buf; spdk_put_io_channel(accel_ch->ch); + free(accel_ch->task_pool_base); } struct spdk_io_channel * @@ -244,6 +499,7 @@ spdk_accel_engine_initialize(void) { SPDK_NOTICELOG("Accel engine initialized to use software engine.\n"); accel_engine_module_initialize(); + /* * We need a unique identifier for the accel engine framework, so use the * spdk_accel_module_list address for this purpose. 
@@ -327,88 +583,360 @@ spdk_accel_engine_config_text(FILE *fp) } } -/* The SW Accelerator module is "built in" here (rest of file) */ +/* + * The SW Accelerator module is "built in" here (rest of file) + */ + +#define SW_ACCEL_BATCH_SIZE 2048 + +enum sw_accel_opcode { + SW_ACCEL_OPCODE_MEMMOVE = 0, + SW_ACCEL_OPCODE_MEMFILL = 1, + SW_ACCEL_OPCODE_COMPARE = 2, + SW_ACCEL_OPCODE_CRC32C = 3, + SW_ACCEL_OPCODE_DUALCAST = 4, +}; + +struct sw_accel_op { + struct sw_accel_io_channel *sw_ch; + void *cb_arg; + spdk_accel_completion_cb cb_fn; + void *src; + union { + void *dst; + void *src2; + }; + void *dst2; + uint32_t seed; + uint64_t fill_pattern; + enum sw_accel_opcode op_code; + uint64_t nbytes; + TAILQ_ENTRY(sw_accel_op) link; +}; + +/* The sw accel engine only supports one outstanding batch at a time. */ +struct sw_accel_io_channel { + TAILQ_HEAD(, sw_accel_op) op_pool; + TAILQ_HEAD(, sw_accel_op) batch; +}; static uint64_t sw_accel_get_capabilities(void) { return ACCEL_COPY | ACCEL_FILL | ACCEL_CRC32C | ACCEL_COMPARE | - ACCEL_DUALCAST; + ACCEL_DUALCAST | ACCEL_BATCH; +} + +static uint32_t +sw_accel_batch_get_max(void) +{ + return SW_ACCEL_BATCH_SIZE; +} + +/* The sw engine plug-in does not have a public API, it is only callable + * from the accel fw and thus does not need to have its own struct definition + * of a batch, it just simply casts the address of the single supported batch + * as the struct spdk_accel_batch pointer.
+ */ +static struct spdk_accel_batch * +sw_accel_batch_start(struct spdk_io_channel *ch) +{ + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + if (!TAILQ_EMPTY(&sw_ch->batch)) { + SPDK_ERRLOG("SW accel engine only supports one batch at a time.\n"); + return NULL; + } + + return (struct spdk_accel_batch *)&sw_ch->batch; +} + +static struct sw_accel_op * +_prep_op(struct sw_accel_io_channel *sw_ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + + if ((struct spdk_accel_batch *)&sw_ch->batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return NULL; + } + + if (!TAILQ_EMPTY(&sw_ch->op_pool)) { + op = TAILQ_FIRST(&sw_ch->op_pool); + TAILQ_REMOVE(&sw_ch->op_pool, op, link); + } else { + SPDK_ERRLOG("Ran out of operations for batch\n"); + return NULL; + } + + op->cb_arg = cb_arg; + op->cb_fn = cb_fn; + op->sw_ch = sw_ch; + + return op; +} + +static int +sw_accel_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->src = src; + op->dst = dst; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_MEMMOVE; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst1, + void *dst2, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. 
*/ + op->src = src; + op->dst = dst1; + op->dst2 = dst2; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_DUALCAST; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; } static int -sw_accel_submit_copy(void *cb_arg, struct spdk_io_channel *ch, void *dst, void *src, - uint64_t nbytes, - spdk_accel_completion_cb cb) +sw_accel_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *src1, + void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { - struct spdk_accel_task *accel_req; + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->src = src1; + op->src2 = src2; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_COMPARE; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->dst = dst; + op->fill_pattern = fill; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_MEMFILL; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + uint32_t *dst, + void *src, uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. 
*/ + op->dst = (void *)dst; + op->src = src; + op->seed = seed; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_CRC32C; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + + +static int +sw_accel_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + if ((struct spdk_accel_batch *)&sw_ch->batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return -EINVAL; + } + + /* Cancel the batch items by moving them back to the op_pool. */ + while ((op = TAILQ_FIRST(&sw_ch->batch))) { + TAILQ_REMOVE(&sw_ch->batch, op, link); + TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link); + } + + return 0; +} + +static int +sw_accel_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + int batch_status = 0, cmd_status = 0; + + if ((struct spdk_accel_batch *)&sw_ch->batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return -EINVAL; + } + + /* Complete the batch items. 
*/ + while ((op = TAILQ_FIRST(&sw_ch->batch))) { + TAILQ_REMOVE(&sw_ch->batch, op, link); + accel_task = (struct spdk_accel_task *)((uintptr_t)op->cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + + switch (op->op_code) { + case SW_ACCEL_OPCODE_MEMMOVE: + memcpy(op->dst, op->src, op->nbytes); + break; + case SW_ACCEL_OPCODE_DUALCAST: + memcpy(op->dst, op->src, op->nbytes); + memcpy(op->dst2, op->src, op->nbytes); + break; + case SW_ACCEL_OPCODE_COMPARE: + cmd_status = memcmp(op->src, op->src2, op->nbytes); + break; + case SW_ACCEL_OPCODE_MEMFILL: + memset(op->dst, op->fill_pattern, op->nbytes); + break; + case SW_ACCEL_OPCODE_CRC32C: + *(uint32_t *)op->dst = spdk_crc32c_update(op->src, op->nbytes, ~op->seed); + break; + default: + assert(false); + break; + } + + batch_status |= cmd_status; + op->cb_fn(accel_task, cmd_status); + TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link); + } + + /* Now complete the batch request itself. */ + accel_task = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_task, batch_status); + + return 0; +} + +static int +sw_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_task; memcpy(dst, src, (size_t)nbytes); - accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - - offsetof(struct spdk_accel_task, offload_ctx)); - cb(accel_req, 0); + accel_task = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_task, 0); return 0; } static int -sw_accel_submit_dualcast(void *cb_arg, struct spdk_io_channel *ch, void *dst1, void *dst2, - void *src, uint64_t nbytes, spdk_accel_completion_cb cb) +sw_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { - struct spdk_accel_task *accel_req; + struct spdk_accel_task 
*accel_task; memcpy(dst1, src, (size_t)nbytes); memcpy(dst2, src, (size_t)nbytes); - accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - - offsetof(struct spdk_accel_task, offload_ctx)); - cb(accel_req, 0); + accel_task = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_task, 0); return 0; } static int -sw_accel_submit_compare(void *cb_arg, struct spdk_io_channel *ch, void *src1, void *src2, - uint64_t nbytes, - spdk_accel_completion_cb cb) +sw_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { - struct spdk_accel_task *accel_req; + struct spdk_accel_task *accel_task; int result; result = memcmp(src1, src2, (size_t)nbytes); - accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - - offsetof(struct spdk_accel_task, offload_ctx)); - cb(accel_req, result); + accel_task = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_task, result); return 0; } static int -sw_accel_submit_fill(void *cb_arg, struct spdk_io_channel *ch, void *dst, uint8_t fill, - uint64_t nbytes, - spdk_accel_completion_cb cb) +sw_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { - struct spdk_accel_task *accel_req; + struct spdk_accel_task *accel_task; memset(dst, fill, nbytes); - accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - - offsetof(struct spdk_accel_task, offload_ctx)); - cb(accel_req, 0); + accel_task = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_task, 0); return 0; } static int -sw_accel_submit_crc32c(void *cb_arg, struct spdk_io_channel *ch, uint32_t *dst, void *src, - uint32_t seed, uint64_t nbytes, - spdk_accel_completion_cb cb) +sw_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, + 
uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { - struct spdk_accel_task *accel_req; + struct spdk_accel_task *accel_task; *dst = spdk_crc32c_update(src, nbytes, ~seed); - accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - - offsetof(struct spdk_accel_task, offload_ctx)); - cb(accel_req, 0); + accel_task = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_task, 0); return 0; } @@ -419,6 +947,15 @@ static struct spdk_accel_engine sw_accel_engine = { .get_capabilities = sw_accel_get_capabilities, .copy = sw_accel_submit_copy, .dualcast = sw_accel_submit_dualcast, + .batch_get_max = sw_accel_batch_get_max, + .batch_create = sw_accel_batch_start, + .batch_cancel = sw_accel_batch_cancel, + .batch_prep_copy = sw_accel_batch_prep_copy, + .batch_prep_dualcast = sw_accel_batch_prep_dualcast, + .batch_prep_compare = sw_accel_batch_prep_compare, + .batch_prep_fill = sw_accel_batch_prep_fill, + .batch_prep_crc32c = sw_accel_batch_prep_crc32c, + .batch_submit = sw_accel_batch_submit, .compare = sw_accel_submit_compare, .fill = sw_accel_submit_fill, .crc32c = sw_accel_submit_crc32c, @@ -428,12 +965,39 @@ static struct spdk_accel_engine sw_accel_engine = { static int sw_accel_create_cb(void *io_device, void *ctx_buf) { + struct sw_accel_io_channel *sw_ch = ctx_buf; + struct sw_accel_op *op; + int i; + + TAILQ_INIT(&sw_ch->batch); + + TAILQ_INIT(&sw_ch->op_pool); + for (i = 0 ; i < SW_ACCEL_BATCH_SIZE ; i++) { + op = calloc(1, sizeof(struct sw_accel_op)); + if (op == NULL) { + SPDK_ERRLOG("Failed to allocate operation for batch.\n"); + while ((op = TAILQ_FIRST(&sw_ch->op_pool))) { + TAILQ_REMOVE(&sw_ch->op_pool, op, link); + free(op); + } + return -ENOMEM; + } + TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link); + } + return 0; } static void sw_accel_destroy_cb(void *io_device, void *ctx_buf) { + struct sw_accel_io_channel *sw_ch = ctx_buf; + struct sw_accel_op *op; + + while ((op = 
TAILQ_FIRST(&sw_ch->op_pool))) { + TAILQ_REMOVE(&sw_ch->op_pool, op, link); + free(op); + } } static struct spdk_io_channel *sw_accel_get_io_channel(void) @@ -451,8 +1015,8 @@ static int sw_accel_engine_init(void) { accel_sw_register(&sw_accel_engine); - spdk_io_device_register(&sw_accel_engine, sw_accel_create_cb, sw_accel_destroy_cb, 0, - "sw_accel_engine"); + spdk_io_device_register(&sw_accel_engine, sw_accel_create_cb, sw_accel_destroy_cb, + sizeof(struct sw_accel_io_channel), "sw_accel_engine"); return 0; } diff --git a/lib/accel/spdk_accel.map b/lib/accel/spdk_accel.map index abab7a7fc9f..bfccf0a9000 100644 --- a/lib/accel/spdk_accel.map +++ b/lib/accel/spdk_accel.map @@ -8,12 +8,20 @@ spdk_accel_engine_module_finish; spdk_accel_engine_get_io_channel; spdk_accel_get_capabilities; + spdk_accel_batch_get_max; + spdk_accel_batch_create; + spdk_accel_batch_prep_copy; + spdk_accel_batch_prep_dualcast; + spdk_accel_batch_prep_compare; + spdk_accel_batch_prep_fill; + spdk_accel_batch_prep_crc32c; + spdk_accel_batch_submit; + spdk_accel_batch_cancel; spdk_accel_submit_copy; spdk_accel_submit_dualcast; spdk_accel_submit_compare; spdk_accel_submit_fill; spdk_accel_submit_crc32c; - spdk_accel_task_size; spdk_accel_write_config_json; # functions needed by modules diff --git a/lib/bdev/Makefile b/lib/bdev/Makefile index ca0bf992ad2..497e4bb2774 100644 --- a/lib/bdev/Makefile +++ b/lib/bdev/Makefile @@ -35,7 +35,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk SO_VER := 3 -SO_MINOR := 0 +SO_MINOR := 1 ifeq ($(CONFIG_VTUNE),y) CFLAGS += -I$(CONFIG_VTUNE_DIR)/include -I$(CONFIG_VTUNE_DIR)/sdk/src/ittnotify diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c index ec4dd3f2df5..ed55c164bd0 100644 --- a/lib/bdev/bdev.c +++ b/lib/bdev/bdev.c @@ -403,12 +403,52 @@ spdk_bdev_set_opts(struct spdk_bdev_opts *opts) return 0; } -/* - * Will implement the whitelist in the furture - */ +struct spdk_bdev_examine_item { + char *name; + TAILQ_ENTRY(spdk_bdev_examine_item) link; +}; + +TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); + +struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( + g_bdev_examine_allowlist); + +static inline bool +bdev_examine_allowlist_check(const char *name) +{ + struct spdk_bdev_examine_item *item; + TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { + if (strcmp(name, item->name) == 0) { + return true; + } + } + return false; +} + +static inline void +bdev_examine_allowlist_free(void) +{ + struct spdk_bdev_examine_item *item; + while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { + item = TAILQ_FIRST(&g_bdev_examine_allowlist); + TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); + free(item->name); + free(item); + } +} + static inline bool -bdev_in_examine_whitelist(struct spdk_bdev *bdev) +bdev_in_examine_allowlist(struct spdk_bdev *bdev) { + struct spdk_bdev_alias *tmp; + if (bdev_examine_allowlist_check(bdev->name)) { + return true; + } + TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { + if (bdev_examine_allowlist_check(tmp->alias)) { + return true; + } + } return false; } @@ -418,7 +458,89 @@ bdev_ok_to_examine(struct spdk_bdev *bdev) if (g_bdev_opts.bdev_auto_examine) { return true; } else { - return bdev_in_examine_whitelist(bdev); + return bdev_in_examine_allowlist(bdev); + } +} + +static void +bdev_examine(struct spdk_bdev *bdev) +{ + struct spdk_bdev_module *module; + uint32_t action; + + TAILQ_FOREACH(module, 
&g_bdev_mgr.bdev_modules, internal.tailq) { + if (module->examine_config && bdev_ok_to_examine(bdev)) { + action = module->internal.action_in_progress; + module->internal.action_in_progress++; + module->examine_config(bdev); + if (action != module->internal.action_in_progress) { + SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", + module->name); + } + } + } + + if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { + if (bdev->internal.claim_module->examine_disk) { + bdev->internal.claim_module->internal.action_in_progress++; + bdev->internal.claim_module->examine_disk(bdev); + } + return; + } + + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (module->examine_disk && bdev_ok_to_examine(bdev)) { + module->internal.action_in_progress++; + module->examine_disk(bdev); + } + } +} + +int +spdk_bdev_examine(const char *name) +{ + struct spdk_bdev *bdev; + struct spdk_bdev_examine_item *item; + + if (g_bdev_opts.bdev_auto_examine) { + SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); + return -EINVAL; + } + + if (bdev_examine_allowlist_check(name)) { + SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); + return -EEXIST; + } + + item = calloc(1, sizeof(*item)); + if (!item) { + return -ENOMEM; + } + item->name = strdup(name); + if (!item->name) { + free(item); + return -ENOMEM; + } + TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); + + bdev = spdk_bdev_get_by_name(name); + if (bdev) { + bdev_examine(bdev); + } + return 0; +} + +static inline void +bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_bdev_examine_item *item; + TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_examine"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", item->name); + spdk_json_write_object_end(w); + 
spdk_json_write_object_end(w); } } @@ -922,6 +1044,8 @@ spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) spdk_json_write_object_end(w); spdk_json_write_object_end(w); + bdev_examine_allowlist_config_json(w); + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { if (bdev_module->config_json) { bdev_module->config_json(w); @@ -1267,6 +1391,8 @@ bdev_mgr_unregister_cb(void *io_device) spdk_free(g_bdev_mgr.zero_buffer); + bdev_examine_allowlist_free(); + cb_fn(g_fini_cb_arg); g_fini_cb_fn = NULL; g_fini_cb_arg = NULL; @@ -2265,7 +2391,7 @@ bdev_channel_poll_qos(void *arg) * timeslice has actually expired. This should never happen * with a well-behaved timer implementation. */ - return 0; + return SPDK_POLLER_IDLE; } /* Reset for next round of rate limiting */ @@ -2457,7 +2583,7 @@ bdev_poll_timeout_io(void *arg) ctx = calloc(1, sizeof(struct poll_timeout_ctx)); if (!ctx) { SPDK_ERRLOG("failed to allocate memory\n"); - return 1; + return SPDK_POLLER_BUSY; } ctx->desc = desc; ctx->cb_arg = desc->cb_arg; @@ -2476,7 +2602,7 @@ bdev_poll_timeout_io(void *arg) ctx, bdev_channel_poll_timeout_io_done); - return 1; + return SPDK_POLLER_BUSY; } int @@ -3114,7 +3240,7 @@ bdev_calculate_measured_queue_depth(void *ctx) bdev->internal.temporary_queue_depth = 0; spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); - return 0; + return SPDK_POLLER_BUSY; } void @@ -3844,6 +3970,7 @@ bdev_comparev_and_writev_blocks_locked(void *ctx, int status) if (status) { bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + return; } bdev_compare_and_write_do_compare(bdev_io); @@ -5309,39 +5436,11 @@ bdev_fini(struct spdk_bdev *bdev) static void bdev_start(struct spdk_bdev *bdev) { - struct spdk_bdev_module *module; - uint32_t action; - SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 
TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); /* Examine configuration before initializing I/O */ - TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { - if (module->examine_config && bdev_ok_to_examine(bdev)) { - action = module->internal.action_in_progress; - module->internal.action_in_progress++; - module->examine_config(bdev); - if (action != module->internal.action_in_progress) { - SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", - module->name); - } - } - } - - if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { - if (bdev->internal.claim_module->examine_disk) { - bdev->internal.claim_module->internal.action_in_progress++; - bdev->internal.claim_module->examine_disk(bdev); - } - return; - } - - TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { - if (module->examine_disk && bdev_ok_to_examine(bdev)) { - module->internal.action_in_progress++; - module->examine_disk(bdev); - } - } + bdev_examine(bdev); } int @@ -6458,12 +6557,12 @@ bdev_lock_lba_range_check_io(void *_i) TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { if (bdev_io_range_is_locked(bdev_io, range)) { ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); - return 1; + return SPDK_POLLER_BUSY; } } spdk_for_each_channel_continue(i, 0); - return 1; + return SPDK_POLLER_BUSY; } static void diff --git a/lib/bdev/bdev_rpc.c b/lib/bdev/bdev_rpc.c index 6ce7136c4f4..5c8cfa469d3 100644 --- a/lib/bdev/bdev_rpc.c +++ b/lib/bdev/bdev_rpc.c @@ -96,3 +96,48 @@ rpc_bdev_set_options(struct spdk_jsonrpc_request *request, const struct spdk_jso } SPDK_RPC_REGISTER("bdev_set_options", rpc_bdev_set_options, SPDK_RPC_STARTUP) SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_set_options, set_bdev_options) + +struct rpc_bdev_examine { + char *name; +}; + +static void +free_rpc_bdev_examine(struct rpc_bdev_examine *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder 
rpc_examine_bdev_decoders[] = { + {"name", offsetof(struct rpc_bdev_examine, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_examine_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_examine req = {NULL}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_examine_bdev_decoders, + SPDK_COUNTOF(rpc_examine_bdev_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = spdk_bdev_examine(req.name); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_examine(&req); +} +SPDK_RPC_REGISTER("bdev_examine", rpc_bdev_examine_bdev, SPDK_RPC_RUNTIME) diff --git a/lib/bdev/part.c b/lib/bdev/part.c index 01a39559106..1f90ce49230 100644 --- a/lib/bdev/part.c +++ b/lib/bdev/part.c @@ -484,12 +484,13 @@ spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_base part->internal.bdev.dif_check_flags = base->bdev->dif_check_flags; part->internal.bdev.name = strdup(name); - part->internal.bdev.product_name = strdup(product_name); - if (part->internal.bdev.name == NULL) { SPDK_ERRLOG("Failed to allocate name for new part of bdev %s\n", spdk_bdev_get_name(base->bdev)); return -1; - } else if (part->internal.bdev.product_name == NULL) { + } + + part->internal.bdev.product_name = strdup(product_name); + if (part->internal.bdev.product_name == NULL) { free(part->internal.bdev.name); SPDK_ERRLOG("Failed to allocate product name for new part of bdev %s\n", spdk_bdev_get_name(base->bdev)); diff --git a/lib/bdev/spdk_bdev.map b/lib/bdev/spdk_bdev.map index 9f9c3c7e5f3..80c3bb555a0 100644 --- a/lib/bdev/spdk_bdev.map +++ 
b/lib/bdev/spdk_bdev.map @@ -4,6 +4,7 @@ # Public functions in bdev.h spdk_bdev_get_opts; spdk_bdev_set_opts; + spdk_bdev_examine; spdk_bdev_initialize; spdk_bdev_finish; spdk_bdev_config_text; diff --git a/lib/blob/blobstore.c b/lib/blob/blobstore.c index 8dfdfd2477c..370208697d8 100644 --- a/lib/blob/blobstore.c +++ b/lib/blob/blobstore.c @@ -39,6 +39,7 @@ #include "spdk/queue.h" #include "spdk/thread.h" #include "spdk/bit_array.h" +#include "spdk/bit_pool.h" #include "spdk/likely.h" #include "spdk/util.h" #include "spdk/string.h" @@ -105,17 +106,33 @@ bs_release_md_page(struct spdk_blob_store *bs, uint32_t page) spdk_bit_array_clear(bs->used_md_pages, page); } -static void -bs_claim_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) +static uint32_t +bs_claim_cluster(struct spdk_blob_store *bs) { - assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters)); - assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == false); - assert(bs->num_free_clusters > 0); + uint32_t cluster_num; - SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %u\n", cluster_num); + cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters); + if (cluster_num == UINT32_MAX) { + return UINT32_MAX; + } - spdk_bit_array_set(bs->used_clusters, cluster_num); + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %u\n", cluster_num); bs->num_free_clusters--; + + return cluster_num; +} + +static void +bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) +{ + assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters)); + assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true); + assert(bs->num_free_clusters < bs->total_clusters); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Releasing cluster %u\n", cluster_num); + + spdk_bit_pool_free_bit(bs->used_clusters, cluster_num); + bs->num_free_clusters++; } static int @@ -135,16 +152,13 @@ blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t clust static int bs_allocate_cluster(struct spdk_blob 
*blob, uint32_t cluster_num, - uint64_t *lowest_free_cluster, uint32_t *lowest_free_md_page, bool update_map) + uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map) { uint32_t *extent_page = 0; - pthread_mutex_lock(&blob->bs->used_clusters_mutex); - *lowest_free_cluster = spdk_bit_array_find_first_clear(blob->bs->used_clusters, - *lowest_free_cluster); - if (*lowest_free_cluster == UINT32_MAX) { + *cluster = bs_claim_cluster(blob->bs); + if (*cluster == UINT32_MAX) { /* No more free clusters. Cannot satisfy the request */ - pthread_mutex_unlock(&blob->bs->used_clusters_mutex); return -ENOSPC; } @@ -156,20 +170,17 @@ bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num, *lowest_free_md_page); if (*lowest_free_md_page == UINT32_MAX) { /* No more free md pages. Cannot satisfy the request */ - pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + bs_release_cluster(blob->bs, *cluster); return -ENOSPC; } bs_claim_md_page(blob->bs, *lowest_free_md_page); } } - SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %lu for blob %lu\n", *lowest_free_cluster, blob->id); - bs_claim_cluster(blob->bs, *lowest_free_cluster); - - pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %lu for blob %lu\n", *cluster, blob->id); if (update_map) { - blob_insert_cluster(blob, cluster_num, *lowest_free_cluster); + blob_insert_cluster(blob, cluster_num, *cluster); if (blob->use_extent_table && *extent_page == 0) { *extent_page = *lowest_free_md_page; } @@ -178,21 +189,6 @@ bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num, return 0; } -static void -bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) -{ - assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters)); - assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == true); - assert(bs->num_free_clusters < bs->total_clusters); - - SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Releasing cluster %u\n", cluster_num); - - 
pthread_mutex_lock(&bs->used_clusters_mutex); - spdk_bit_array_clear(bs->used_clusters, cluster_num); - bs->num_free_clusters++; - pthread_mutex_unlock(&bs->used_clusters_mutex); -} - static void blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs) { @@ -564,8 +560,8 @@ blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob) for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { for (j = 0; j < desc_extent_rle->extents[i].length; j++) { if (desc_extent_rle->extents[i].cluster_idx != 0) { - if (!spdk_bit_array_get(blob->bs->used_clusters, - desc_extent_rle->extents[i].cluster_idx + j)) { + if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, + desc_extent_rle->extents[i].cluster_idx + j)) { return -EINVAL; } } @@ -673,7 +669,7 @@ blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob) for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { if (desc_extent->cluster_idx[i] != 0) { - if (!spdk_bit_array_get(blob->bs->used_clusters, desc_extent->cluster_idx[i])) { + if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) { return -EINVAL; } } @@ -815,14 +811,12 @@ blob_serialize_add_page(const struct spdk_blob *blob, if (*page_count == 0) { assert(*pages == NULL); *page_count = 1; - *pages = spdk_malloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, + *pages = spdk_malloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); } else { assert(*pages != NULL); (*page_count)++; - *pages = spdk_realloc(*pages, - SPDK_BS_PAGE_SIZE * (*page_count), - SPDK_BS_PAGE_SIZE); + *pages = spdk_realloc(*pages, SPDK_BS_PAGE_SIZE * (*page_count), 0); } if (*pages == NULL) { @@ -1238,9 +1232,8 @@ blob_md_page_calc_crc(void *page) } static void -blob_load_final(void *cb_arg, int bserrno) +blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno) { - struct spdk_blob_load_ctx *ctx = cb_arg; struct spdk_blob *blob = ctx->blob; if (bserrno == 0) { 
@@ -1327,8 +1320,8 @@ blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) if (ctx->pages == NULL) { /* First iteration of this function, allocate buffer for single EXTENT_PAGE */ - ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, NULL, SPDK_ENV_SOCKET_ID_ANY, - SPDK_MALLOC_DMA); + ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); if (!ctx->pages) { blob_load_final(ctx, -ENOMEM); return; @@ -1431,8 +1424,7 @@ blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) /* Read the next page */ ctx->num_pages++; - ctx->pages = spdk_realloc(ctx->pages, (sizeof(*page) * ctx->num_pages), - sizeof(*page)); + ctx->pages = spdk_realloc(ctx->pages, (sizeof(*page) * ctx->num_pages), 0); if (ctx->pages == NULL) { blob_load_final(ctx, -ENOMEM); return; @@ -1500,7 +1492,7 @@ blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob, } ctx->blob = blob; - ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE); + ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, 0); if (!ctx->pages) { free(ctx); cb_fn(seq, cb_arg, -ENOMEM); @@ -1557,9 +1549,8 @@ bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, ui static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx); static void -blob_persist_complete(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno) { - struct spdk_blob_persist_ctx *ctx = cb_arg; struct spdk_blob_persist_ctx *next_persist; struct spdk_blob *blob = ctx->blob; @@ -1584,6 +1575,71 @@ blob_persist_complete(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) } } +static void +blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + size_t i; + + if (bserrno != 0) { + 
blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* Release all extent_pages that were truncated */ + for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { + /* Nothing to release if it was not allocated */ + if (blob->active.extent_pages[i] != 0) { + bs_release_md_page(bs, blob->active.extent_pages[i]); + } + } + + if (blob->active.num_extent_pages == 0) { + free(blob->active.extent_pages); + blob->active.extent_pages = NULL; + blob->active.extent_pages_array_size = 0; + } else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) { +#ifndef __clang_analyzer__ + void *tmp; + + /* scan-build really can't figure reallocs, workaround it */ + tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages); + assert(tmp != NULL); + blob->active.extent_pages = tmp; +#endif + blob->active.extent_pages_array_size = blob->active.num_extent_pages; + } + + blob_persist_complete(seq, ctx, bserrno); +} + +static void +blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) +{ + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + size_t i; + uint64_t lba; + uint32_t lba_count; + spdk_bs_batch_t *batch; + + batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx); + lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); + + /* Clear all extent_pages that were truncated */ + for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { + /* Nothing to clear if it was not allocated */ + if (blob->active.extent_pages[i] != 0) { + lba = bs_md_page_to_lba(bs, blob->clean.extent_pages[i]); + bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + } + + bs_batch_close(batch); +} + static void blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) { @@ -1597,6 +1653,7 @@ blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserr return; } + 
pthread_mutex_lock(&bs->used_clusters_mutex); /* Release all clusters that were truncated */ for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]); @@ -1606,6 +1663,7 @@ blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserr bs_release_cluster(bs, cluster_num); } } + pthread_mutex_unlock(&bs->used_clusters_mutex); if (blob->active.num_clusters == 0) { free(blob->active.clusters); @@ -1620,22 +1678,17 @@ blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserr assert(tmp != NULL); blob->active.clusters = tmp; - tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages); - assert(tmp != NULL); - blob->active.extent_pages = tmp; #endif - blob->active.extent_pages_array_size = blob->active.num_extent_pages; blob->active.cluster_array_size = blob->active.num_clusters; } - /* TODO: Add path to persist clear extent pages. */ - blob_persist_complete(seq, ctx, bserrno); + /* Move on to clearing extent pages */ + blob_persist_clear_extents(seq, ctx); } static void -blob_persist_clear_clusters(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) { - struct spdk_blob_persist_ctx *ctx = cb_arg; struct spdk_blob *blob = ctx->blob; struct spdk_blob_store *bs = blob->bs; spdk_bs_batch_t *batch; @@ -1643,11 +1696,6 @@ blob_persist_clear_clusters(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) uint64_t lba; uint32_t lba_count; - if (bserrno != 0) { - blob_persist_complete(seq, ctx, bserrno); - return; - } - /* Clusters don't move around in blobs. The list shrinks or grows * at the end, but no changes ever occur in the middle of the list. 
*/ @@ -1720,7 +1768,7 @@ blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) } /* Move on to clearing clusters */ - blob_persist_clear_clusters(seq, ctx, 0); + blob_persist_clear_clusters(seq, ctx); } static void @@ -1799,9 +1847,8 @@ blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) } static void -blob_persist_write_page_chain(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) { - struct spdk_blob_persist_ctx *ctx = cb_arg; struct spdk_blob *blob = ctx->blob; struct spdk_blob_store *bs = blob->bs; uint64_t lba; @@ -1810,11 +1857,6 @@ blob_persist_write_page_chain(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno spdk_bs_batch_t *batch; size_t i; - if (bserrno != 0) { - blob_persist_complete(seq, ctx, bserrno); - return; - } - /* Clusters don't move around in blobs. The list shrinks or grows * at the end, but no changes ever occur in the middle of the list. */ @@ -1843,7 +1885,7 @@ blob_resize(struct spdk_blob *blob, uint64_t sz) { uint64_t i; uint64_t *tmp; - uint64_t lfc; /* lowest free cluster */ + uint64_t cluster; uint32_t lfmd; /* lowest free md page */ uint64_t num_clusters; uint32_t *ep_tmp; @@ -1876,19 +1918,10 @@ blob_resize(struct spdk_blob *blob, uint64_t sz) current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP); } - /* Do two passes - one to verify that we can obtain enough clusters - * and md pages, another to actually claim them. - */ - - if (spdk_blob_is_thin_provisioned(blob) == false) { - lfc = 0; - for (i = num_clusters; i < sz; i++) { - lfc = spdk_bit_array_find_first_clear(bs->used_clusters, lfc); - if (lfc == UINT32_MAX) { - /* No more free clusters. Cannot satisfy the request */ - return -ENOSPC; - } - lfc++; + /* Check first that we have enough clusters and md pages before we start claiming them. 
*/ + if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) { + if ((sz - num_clusters) > bs->num_free_clusters) { + return -ENOSPC; } lfmd = 0; for (i = current_num_ep; i < new_num_ep ; i++) { @@ -1929,13 +1962,14 @@ blob_resize(struct spdk_blob *blob, uint64_t sz) blob->state = SPDK_BLOB_STATE_DIRTY; if (spdk_blob_is_thin_provisioned(blob) == false) { - lfc = 0; + cluster = 0; lfmd = 0; + pthread_mutex_lock(&blob->bs->used_clusters_mutex); for (i = num_clusters; i < sz; i++) { - bs_allocate_cluster(blob, i, &lfc, &lfmd, true); - lfc++; + bs_allocate_cluster(blob, i, &cluster, &lfmd, true); lfmd++; } + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); } blob->active.num_clusters = sz; @@ -2001,7 +2035,7 @@ blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx) ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); /* Start writing the metadata from last page to first */ blob->state = SPDK_BLOB_STATE_CLEAN; - blob_persist_write_page_chain(seq, ctx, 0); + blob_persist_write_page_chain(seq, ctx); } static void @@ -2220,7 +2254,9 @@ blob_insert_cluster_cpl(void *cb_arg, int bserrno) * but continue without error. 
*/ bserrno = 0; } + pthread_mutex_lock(&ctx->blob->bs->used_clusters_mutex); bs_release_cluster(ctx->blob->bs, ctx->new_cluster); + pthread_mutex_unlock(&ctx->blob->bs->used_clusters_mutex); if (ctx->new_extent_page != 0) { bs_release_md_page(ctx->blob->bs, ctx->new_extent_page); } @@ -2317,8 +2353,10 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob, } } + pthread_mutex_lock(&blob->bs->used_clusters_mutex); rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page, false); + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); if (rc != 0) { spdk_free(ctx->buf); free(ctx); @@ -2332,7 +2370,9 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob, ctx->seq = bs_sequence_start(_ch, &cpl); if (!ctx->seq) { + pthread_mutex_lock(&blob->bs->used_clusters_mutex); bs_release_cluster(blob->bs, ctx->new_cluster); + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); spdk_free(ctx->buf); free(ctx); bs_user_op_abort(op); @@ -2354,7 +2394,7 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob, } } -static inline void +static inline bool blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, uint64_t *lba, uint32_t *lba_count) { @@ -2364,8 +2404,10 @@ blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint6 assert(blob->back_bs_dev != NULL); *lba = bs_io_unit_to_back_dev_lba(blob, io_unit); *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count); + return false; } else { *lba = bs_blob_io_unit_to_lba(blob, io_unit); + return true; } } @@ -2480,6 +2522,7 @@ blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blo struct spdk_bs_cpl cpl; uint64_t lba; uint32_t lba_count; + bool is_allocated; assert(blob != NULL); @@ -2487,7 +2530,7 @@ blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blo cpl.u.blob_basic.cb_fn = cb_fn; cpl.u.blob_basic.cb_arg = cb_arg; - blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); + 
is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); if (blob->frozen_refcnt) { /* This blob I/O is frozen */ @@ -2515,7 +2558,7 @@ blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blo return; } - if (bs_io_unit_is_allocated(blob, offset)) { + if (is_allocated) { /* Read from the blob */ bs_batch_read_dev(batch, payload, lba, lba_count); } else { @@ -2528,7 +2571,7 @@ blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blo } case SPDK_BLOB_WRITE: case SPDK_BLOB_WRITE_ZEROES: { - if (bs_io_unit_is_allocated(blob, offset)) { + if (is_allocated) { /* Write to the blob */ spdk_bs_batch_t *batch; @@ -2573,7 +2616,7 @@ blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blo return; } - if (bs_io_unit_is_allocated(blob, offset)) { + if (is_allocated) { bs_batch_unmap_dev(batch, lba, lba_count); } @@ -2745,6 +2788,7 @@ blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_chan if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) { uint32_t lba_count; uint64_t lba; + bool is_allocated; cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; cpl.u.blob_basic.cb_fn = cb_fn; @@ -2768,7 +2812,7 @@ blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_chan return; } - blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); + is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); if (read) { spdk_bs_sequence_t *seq; @@ -2779,14 +2823,14 @@ blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_chan return; } - if (bs_io_unit_is_allocated(blob, offset)) { + if (is_allocated) { bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); } else { bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); } } else { - if (bs_io_unit_is_allocated(blob, offset)) { + if (is_allocated) { 
spdk_bs_sequence_t *seq; seq = bs_sequence_start(_channel, &cpl); @@ -2839,6 +2883,10 @@ blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) { struct spdk_blob *blob; + if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) { + return NULL; + } + TAILQ_FOREACH(blob, &bs->blobs, link) { if (blob->id == blobid) { return blob; @@ -2947,14 +2995,16 @@ bs_dev_destroy(void *io_device) TAILQ_FOREACH_SAFE(blob, &bs->blobs, link, blob_tmp) { TAILQ_REMOVE(&bs->blobs, blob, link); + spdk_bit_array_clear(bs->open_blobids, blob->id); blob_free(blob); } pthread_mutex_destroy(&bs->used_clusters_mutex); + spdk_bit_array_free(&bs->open_blobids); spdk_bit_array_free(&bs->used_blobids); spdk_bit_array_free(&bs->used_md_pages); - spdk_bit_array_free(&bs->used_clusters); + spdk_bit_pool_free(&bs->used_clusters); /* * If this function is called for any reason except a successful unload, * the unload_cpl type will be NONE and this will be a nop. @@ -3084,10 +3134,43 @@ bs_opts_verify(struct spdk_bs_opts *opts) return 0; } +/* START spdk_bs_load */ + +/* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */ + +struct spdk_bs_load_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; + + struct spdk_bs_md_mask *mask; + bool in_page_chain; + uint32_t page_index; + uint32_t cur_page; + struct spdk_blob_md_page *page; + + uint64_t num_extent_pages; + uint32_t *extent_page_num; + struct spdk_blob_md_page *extent_pages; + struct spdk_bit_array *used_clusters; + + spdk_bs_sequence_t *seq; + spdk_blob_op_with_handle_complete iter_cb_fn; + void *iter_cb_arg; + struct spdk_blob *blob; + spdk_blob_id blobid; + + /* These fields are used in the spdk_bs_dump path. 
*/ + FILE *fp; + spdk_bs_dump_print_xattr print_xattr_fn; + char xattr_name[4096]; +}; + static int -bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs) +bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs, + struct spdk_bs_load_ctx **_ctx) { struct spdk_blob_store *bs; + struct spdk_bs_load_ctx *ctx; uint64_t dev_size; int rc; @@ -3109,6 +3192,24 @@ bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_st return -ENOMEM; } + ctx = calloc(1, sizeof(struct spdk_bs_load_ctx)); + if (!ctx) { + free(bs); + return -ENOMEM; + } + + ctx->bs = bs; + ctx->iter_cb_fn = opts->iter_cb_fn; + ctx->iter_cb_arg = opts->iter_cb_arg; + + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + free(bs); + return -ENOMEM; + } + TAILQ_INIT(&bs->blobs); TAILQ_INIT(&bs->snapshots); bs->dev = dev; @@ -3121,17 +3222,20 @@ bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_st */ bs->cluster_sz = opts->cluster_sz; bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); + ctx->used_clusters = spdk_bit_array_create(bs->total_clusters); + if (!ctx->used_clusters) { + spdk_free(ctx->super); + free(ctx); + free(bs); + return -ENOMEM; + } + bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; if (spdk_u32_is_pow2(bs->pages_per_cluster)) { bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster); } bs->num_free_clusters = bs->total_clusters; - bs->used_clusters = spdk_bit_array_create(bs->total_clusters); bs->io_unit_size = dev->blocklen; - if (bs->used_clusters == NULL) { - free(bs); - return -ENOMEM; - } bs->max_channel_ops = opts->max_channel_ops; bs->super_blob = SPDK_BLOBID_INVALID; @@ -3140,6 +3244,7 @@ bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_st /* The metadata is assumed to be at least 1 page */ bs->used_md_pages = 
spdk_bit_array_create(1); bs->used_blobids = spdk_bit_array_create(0); + bs->open_blobids = spdk_bit_array_create(0); pthread_mutex_init(&bs->used_clusters_mutex, NULL); @@ -3149,41 +3254,22 @@ bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_st if (rc == -1) { spdk_io_device_unregister(bs, NULL); pthread_mutex_destroy(&bs->used_clusters_mutex); + spdk_bit_array_free(&bs->open_blobids); spdk_bit_array_free(&bs->used_blobids); spdk_bit_array_free(&bs->used_md_pages); - spdk_bit_array_free(&bs->used_clusters); + spdk_bit_array_free(&ctx->used_clusters); + spdk_free(ctx->super); + free(ctx); free(bs); /* FIXME: this is a lie but don't know how to get a proper error code here */ return -ENOMEM; } + *_ctx = ctx; *_bs = bs; return 0; } -/* START spdk_bs_load, spdk_bs_load_ctx will used for both load and unload. */ - -struct spdk_bs_load_ctx { - struct spdk_blob_store *bs; - struct spdk_bs_super_block *super; - - struct spdk_bs_md_mask *mask; - bool in_page_chain; - uint32_t page_index; - uint32_t cur_page; - struct spdk_blob_md_page *page; - - uint64_t num_extent_pages; - uint32_t *extent_page_num; - struct spdk_blob_md_page *extent_pages; - - spdk_bs_sequence_t *seq; - spdk_blob_op_with_handle_complete iter_cb_fn; - void *iter_cb_arg; - struct spdk_blob *blob; - spdk_blob_id blobid; -}; - static void bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) { @@ -3192,44 +3278,10 @@ bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) spdk_free(ctx->super); bs_sequence_finish(ctx->seq, bserrno); bs_free(ctx->bs); + spdk_bit_array_free(&ctx->used_clusters); free(ctx); } -static void -bs_set_mask(struct spdk_bit_array *array, struct spdk_bs_md_mask *mask) -{ - uint32_t i = 0; - - while (true) { - i = spdk_bit_array_find_first_set(array, i); - if (i >= mask->length) { - break; - } - mask->mask[i / 8] |= 1U << (i % 8); - i++; - } -} - -static int -bs_load_mask(struct spdk_bit_array **array_ptr, struct spdk_bs_md_mask *mask) -{ - struct 
spdk_bit_array *array; - uint32_t i; - - if (spdk_bit_array_resize(array_ptr, mask->length) < 0) { - return -ENOMEM; - } - - array = *array_ptr; - for (i = 0; i < mask->length; i++) { - if (mask->mask[i / 8] & (1U << (i % 8))) { - spdk_bit_array_set(array, i); - } - } - - return 0; -} - static void bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) @@ -3260,9 +3312,18 @@ bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; ctx->mask->length = ctx->bs->total_clusters; - assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_clusters)); - - bs_set_mask(ctx->bs->used_clusters, ctx->mask); + /* We could get here through the normal unload path, or through dirty + * shutdown recovery. For the normal unload path, we use the mask from + * the bit pool. For dirty shutdown recovery, we don't have a bit pool yet - + * only the bit array from the load ctx. 
+ */ + if (ctx->bs->used_clusters) { + assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters)); + spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask); + } else { + assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters)); + spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask); + } lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); @@ -3286,7 +3347,7 @@ bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) ctx->mask->length = ctx->super->md_len; assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); - bs_set_mask(ctx->bs->used_md_pages, ctx->mask); + spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask); lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); @@ -3319,7 +3380,7 @@ bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl c ctx->mask->length = ctx->super->md_len; assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids)); - bs_set_mask(ctx->bs->used_blobids, ctx->mask); + spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask); lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); @@ -3515,6 +3576,7 @@ bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno) static void bs_load_complete(struct spdk_bs_load_ctx *ctx) { + ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx); } @@ -3535,13 +3597,14 @@ bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int 
bserrno) * (in pages) of the metadata region */ assert(ctx->mask->length == ctx->super->md_len); - rc = bs_load_mask(&ctx->bs->used_blobids, ctx->mask); + rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length); if (rc < 0) { spdk_free(ctx->mask); bs_load_ctx_fail(ctx, rc); return; } + spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask); bs_load_complete(ctx); } @@ -3565,14 +3628,15 @@ bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) /* The length of the mask must be exactly equal to the total number of clusters */ assert(ctx->mask->length == ctx->bs->total_clusters); - rc = bs_load_mask(&ctx->bs->used_clusters, ctx->mask); + rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length); if (rc < 0) { spdk_free(ctx->mask); bs_load_ctx_fail(ctx, rc); return; } - ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->bs->used_clusters); + spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask); + ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters); assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); spdk_free(ctx->mask); @@ -3611,13 +3675,14 @@ bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ assert(ctx->mask->length == ctx->super->md_len); - rc = bs_load_mask(&ctx->bs->used_md_pages, ctx->mask); + rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length); if (rc < 0) { spdk_free(ctx->mask); bs_load_ctx_fail(ctx, rc); return; } + spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask); spdk_free(ctx->mask); /* Read the used clusters mask */ @@ -3684,7 +3749,7 @@ bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_p * in the used cluster map. 
*/ if (cluster_idx != 0) { - spdk_bit_array_set(bs->used_clusters, cluster_idx + j); + spdk_bit_array_set(ctx->used_clusters, cluster_idx + j); if (bs->num_free_clusters == 0) { return -ENOSPC; } @@ -3722,7 +3787,7 @@ bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_p cluster_idx >= desc_extent->start_cluster_idx + cluster_count) { return -EINVAL; } - spdk_bit_array_set(bs->used_clusters, cluster_idx); + spdk_bit_array_set(ctx->used_clusters, cluster_idx); if (bs->num_free_clusters == 0) { return -ENOSPC; } @@ -3926,8 +3991,9 @@ bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx) /* Claim all of the clusters used by the metadata */ num_md_clusters = spdk_divide_round_up(ctx->super->md_len, ctx->bs->pages_per_cluster); for (i = 0; i < num_md_clusters; i++) { - bs_claim_cluster(ctx->bs, i); + spdk_bit_array_set(ctx->used_clusters, i); } + ctx->bs->num_free_clusters -= num_md_clusters; spdk_free(ctx->page); bs_load_write_used_md(ctx); } @@ -3980,7 +4046,7 @@ bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx) uint64_t lba; uint64_t i; - ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, SPDK_BS_PAGE_SIZE, + ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); if (!ctx->extent_pages) { bs_load_ctx_fail(ctx, -ENOMEM); @@ -4056,7 +4122,7 @@ bs_load_replay_md(struct spdk_bs_load_ctx *ctx) { ctx->page_index = 0; ctx->cur_page = 0; - ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, + ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); if (!ctx->page) { bs_load_ctx_fail(ctx, -ENOMEM); @@ -4082,7 +4148,13 @@ bs_recover(struct spdk_bs_load_ctx *ctx) return; } - rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters); + rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + rc 
= spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len); if (rc < 0) { bs_load_ctx_fail(ctx, -ENOMEM); return; @@ -4154,7 +4226,7 @@ bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); } ctx->bs->io_unit_size = ctx->super->io_unit_size; - rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters); + rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); if (rc < 0) { bs_load_ctx_fail(ctx, -ENOMEM); return; @@ -4204,34 +4276,13 @@ spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, return; } - err = bs_alloc(dev, &opts, &bs); + err = bs_alloc(dev, &opts, &bs, &ctx); if (err) { dev->destroy(dev); cb_fn(cb_arg, NULL, err); return; } - ctx = calloc(1, sizeof(*ctx)); - if (!ctx) { - bs_free(bs); - cb_fn(cb_arg, NULL, -ENOMEM); - return; - } - - ctx->bs = bs; - ctx->iter_cb_fn = opts.iter_cb_fn; - ctx->iter_cb_arg = opts.iter_cb_arg; - - /* Allocate memory for the super block */ - ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, - SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); - if (!ctx->super) { - free(ctx); - bs_free(bs); - cb_fn(cb_arg, NULL, -ENOMEM); - return; - } - cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; cpl.u.bs_handle.cb_fn = cb_fn; cpl.u.bs_handle.cb_arg = cb_arg; @@ -4256,19 +4307,8 @@ spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, /* START spdk_bs_dump */ -struct spdk_bs_dump_ctx { - struct spdk_blob_store *bs; - struct spdk_bs_super_block *super; - uint32_t cur_page; - struct spdk_blob_md_page *page; - spdk_bs_sequence_t *seq; - FILE *fp; - spdk_bs_dump_print_xattr print_xattr_fn; - char xattr_name[4096]; -}; - static void -bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_dump_ctx *ctx, int bserrno) +bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno) { spdk_free(ctx->super); @@ -4288,7 +4328,7 @@ bs_dump_finish(spdk_bs_sequence_t *seq, struct 
spdk_bs_dump_ctx *ctx, int bserrn static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); static void -bs_dump_print_md_page(struct spdk_bs_dump_ctx *ctx) +bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx) { uint32_t page_idx = ctx->cur_page; struct spdk_blob_md_page *page = ctx->page; @@ -4391,7 +4431,7 @@ bs_dump_print_md_page(struct spdk_bs_dump_ctx *ctx) static void bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) { - struct spdk_bs_dump_ctx *ctx = cb_arg; + struct spdk_bs_load_ctx *ctx = cb_arg; if (bserrno != 0) { bs_dump_finish(seq, ctx, bserrno); @@ -4415,7 +4455,7 @@ bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) { - struct spdk_bs_dump_ctx *ctx = cb_arg; + struct spdk_bs_load_ctx *ctx = cb_arg; uint64_t lba; assert(ctx->cur_page < ctx->super->md_len); @@ -4428,7 +4468,7 @@ bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) static void bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) { - struct spdk_bs_dump_ctx *ctx = cb_arg; + struct spdk_bs_load_ctx *ctx = cb_arg; fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature); if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, @@ -4461,7 +4501,7 @@ bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); ctx->cur_page = 0; - ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, + ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); if (!ctx->page) { bs_dump_finish(seq, ctx, -ENOMEM); @@ -4477,7 +4517,7 @@ spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_x struct spdk_blob_store *bs; struct spdk_bs_cpl cpl; spdk_bs_sequence_t *seq; - struct spdk_bs_dump_ctx *ctx; + struct spdk_bs_load_ctx *ctx; struct spdk_bs_opts opts = {}; int err; @@ -4485,34 +4525,16 @@ 
spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_x spdk_bs_opts_init(&opts); - err = bs_alloc(dev, &opts, &bs); + err = bs_alloc(dev, &opts, &bs, &ctx); if (err) { dev->destroy(dev); cb_fn(cb_arg, err); return; } - ctx = calloc(1, sizeof(*ctx)); - if (!ctx) { - bs_free(bs); - cb_fn(cb_arg, -ENOMEM); - return; - } - - ctx->bs = bs; ctx->fp = fp; ctx->print_xattr_fn = print_xattr_fn; - /* Allocate memory for the super block */ - ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, - SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); - if (!ctx->super) { - free(ctx); - bs_free(bs); - cb_fn(cb_arg, -ENOMEM); - return; - } - cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; cpl.u.bs_basic.cb_fn = cb_fn; cpl.u.bs_basic.cb_arg = cb_arg; @@ -4536,16 +4558,12 @@ spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_x /* START spdk_bs_init */ -struct spdk_bs_init_ctx { - struct spdk_blob_store *bs; - struct spdk_bs_super_block *super; -}; - static void bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) { - struct spdk_bs_init_ctx *ctx = cb_arg; + struct spdk_bs_load_ctx *ctx = cb_arg; + ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); spdk_free(ctx->super); free(ctx); @@ -4555,7 +4573,7 @@ bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) static void bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) { - struct spdk_bs_init_ctx *ctx = cb_arg; + struct spdk_bs_load_ctx *ctx = cb_arg; /* Write super block */ bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0), @@ -4567,7 +4585,7 @@ void spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) { - struct spdk_bs_init_ctx *ctx; + struct spdk_bs_load_ctx *ctx; struct spdk_blob_store *bs; struct spdk_bs_cpl cpl; spdk_bs_sequence_t *seq; @@ -4578,6 +4596,7 @@ spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 
uint32_t i; struct spdk_bs_opts opts = {}; int rc; + uint64_t lba, lba_count; SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Initializing blobstore on dev %p\n", dev); @@ -4601,7 +4620,7 @@ spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, return; } - rc = bs_alloc(dev, &opts, &bs); + rc = bs_alloc(dev, &opts, &bs, &ctx); if (rc) { dev->destroy(dev); cb_fn(cb_arg, NULL, rc); @@ -4621,6 +4640,8 @@ spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, } rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); if (rc < 0) { + spdk_free(ctx->super); + free(ctx); bs_free(bs); cb_fn(cb_arg, NULL, -ENOMEM); return; @@ -4628,29 +4649,22 @@ spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); if (rc < 0) { + spdk_free(ctx->super); + free(ctx); bs_free(bs); cb_fn(cb_arg, NULL, -ENOMEM); return; } - ctx = calloc(1, sizeof(*ctx)); - if (!ctx) { - bs_free(bs); - cb_fn(cb_arg, NULL, -ENOMEM); - return; - } - - ctx->bs = bs; - - /* Allocate memory for the super block */ - ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, - SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); - if (!ctx->super) { + rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len); + if (rc < 0) { + spdk_free(ctx->super); free(ctx); bs_free(bs); cb_fn(cb_arg, NULL, -ENOMEM); return; } + memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, sizeof(ctx->super->signature)); ctx->super->version = SPDK_BS_VERSION; @@ -4712,6 +4726,7 @@ spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, "please decrease number of pages reserved for metadata " "or increase cluster size.\n"); spdk_free(ctx->super); + spdk_bit_array_free(&ctx->used_clusters); free(ctx); bs_free(bs); cb_fn(cb_arg, NULL, -ENOMEM); @@ -4719,9 +4734,10 @@ spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, } /* Claim all of the clusters used by the metadata */ for (i = 0; i < num_md_clusters; i++) { - bs_claim_cluster(bs, i); + 
spdk_bit_array_set(ctx->used_clusters, i); } + bs->num_free_clusters -= num_md_clusters; bs->total_data_clusters = bs->num_free_clusters; cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; @@ -4743,18 +4759,23 @@ spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, /* Clear metadata space */ bs_batch_write_zeroes_dev(batch, 0, num_md_lba); - switch (opts.clear_method) { - case BS_CLEAR_WITH_UNMAP: - /* Trim data clusters */ - bs_batch_unmap_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba); - break; - case BS_CLEAR_WITH_WRITE_ZEROES: - /* Write_zeroes to data clusters */ - bs_batch_write_zeroes_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba); - break; - case BS_CLEAR_WITH_NONE: - default: - break; + lba = num_md_lba; + while (lba < ctx->bs->dev->blockcnt) { + lba_count = spdk_min(UINT32_MAX, ctx->bs->dev->blockcnt - lba); + switch (opts.clear_method) { + case BS_CLEAR_WITH_UNMAP: + /* Trim data clusters */ + bs_batch_unmap_dev(batch, lba, lba_count); + break; + case BS_CLEAR_WITH_WRITE_ZEROES: + /* Write_zeroes to data clusters */ + bs_batch_write_zeroes_dev(batch, lba, lba_count); + break; + case BS_CLEAR_WITH_NONE: + default: + break; + } + lba += lba_count; } bs_batch_close(batch); @@ -4767,7 +4788,7 @@ spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, static void bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) { - struct spdk_bs_init_ctx *ctx = cb_arg; + struct spdk_bs_load_ctx *ctx = cb_arg; struct spdk_blob_store *bs = ctx->bs; /* @@ -4790,7 +4811,7 @@ spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, { struct spdk_bs_cpl cpl; spdk_bs_sequence_t *seq; - struct spdk_bs_init_ctx *ctx; + struct spdk_bs_load_ctx *ctx; SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Destroying blobstore\n"); @@ -5407,9 +5428,8 @@ bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) } static void -bs_clone_snapshot_newblob_cleanup(void *cb_arg, int bserrno) +bs_clone_snapshot_newblob_cleanup(struct 
spdk_clone_snapshot_ctx *ctx, int bserrno) { - struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; struct spdk_blob *newblob = ctx->new.blob; if (bserrno != 0) { @@ -5833,17 +5853,11 @@ bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bser } static void -bs_inflate_blob_done(void *cb_arg, int bserrno) +bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx) { - struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; struct spdk_blob *_blob = ctx->original.blob; struct spdk_blob *_parent; - if (bserrno != 0) { - bs_clone_snapshot_origblob_cleanup(ctx, bserrno); - return; - } - if (ctx->allocate_all) { /* remove thin provisioning */ bs_blob_list_remove(_blob); @@ -5922,7 +5936,7 @@ bs_inflate_blob_touch_next(void *cb_arg, int bserrno) spdk_blob_io_write(_blob, ctx->channel, NULL, offset, 0, bs_inflate_blob_touch_next, ctx); } else { - bs_inflate_blob_done(cb_arg, bserrno); + bs_inflate_blob_done(ctx); } } @@ -5930,7 +5944,7 @@ static void bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) { struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; - uint64_t lfc; /* lowest free cluster */ + uint64_t clusters_needed; uint64_t i; if (bserrno != 0) { @@ -5965,19 +5979,19 @@ bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) /* Do two passes - one to verify that we can obtain enough clusters * and another to actually claim them. */ - lfc = 0; + clusters_needed = 0; for (i = 0; i < _blob->active.num_clusters; i++) { if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { - lfc = spdk_bit_array_find_first_clear(_blob->bs->used_clusters, lfc); - if (lfc == UINT32_MAX) { - /* No more free clusters. Cannot satisfy the request */ - bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); - return; - } - lfc++; + clusters_needed++; } } + if (clusters_needed > _blob->bs->num_free_clusters) { + /* Not enough free clusters. 
Cannot satisfy the request. */ + bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); + return; + } + ctx->cluster = 0; bs_inflate_blob_touch_next(ctx, 0); } @@ -6184,6 +6198,7 @@ delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno) if (ctx->bserrno != 0) { assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL); TAILQ_INSERT_HEAD(&ctx->snapshot->bs->blobs, ctx->snapshot, link); + spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id); } ctx->snapshot->locked_operation_in_progress = false; @@ -6576,6 +6591,7 @@ bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) * Remove the blob from the blob_store list now, to ensure it does not * get returned after this point by blob_lookup(). */ + spdk_bit_array_clear(blob->bs->open_blobids, blob->id); TAILQ_REMOVE(&blob->bs->blobs, blob, link); if (update_clone) { @@ -6621,6 +6637,7 @@ static void bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) { struct spdk_blob *blob = cb_arg; + struct spdk_blob *existing; if (bserrno != 0) { blob_free(blob); @@ -6629,15 +6646,29 @@ bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) return; } + existing = blob_lookup(blob->bs, blob->id); + if (existing) { + blob_free(blob); + existing->open_ref++; + seq->cpl.u.blob_handle.blob = existing; + bs_sequence_finish(seq, 0); + return; + } + blob->open_ref++; + spdk_bit_array_set(blob->bs->open_blobids, blob->id); TAILQ_INSERT_HEAD(&blob->bs->blobs, blob, link); bs_sequence_finish(seq, bserrno); } -static void bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, - struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +static void +bs_open_blob(struct spdk_blob_store *bs, + spdk_blob_id blobid, + struct spdk_blob_open_opts *opts, + spdk_blob_op_with_handle_complete cb_fn, + void *cb_arg) { struct spdk_blob *blob; struct spdk_bs_cpl cpl; @@ -6878,6 +6909,7 @@ blob_insert_cluster_msg(void *arg) if (ctx->extent_page != 0) { 
assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); bs_release_md_page(ctx->blob->bs, ctx->extent_page); + ctx->extent_page = 0; } /* Extent page already allocated. * Every cluster allocation, requires just an update of single extent page. */ @@ -6926,6 +6958,7 @@ blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) * remove them again. */ if (blob->active.num_pages > 0) { + spdk_bit_array_clear(blob->bs->open_blobids, blob->id); TAILQ_REMOVE(&blob->bs->blobs, blob, link); } blob_free(blob); @@ -7111,6 +7144,7 @@ blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, struct spdk_xattr_tailq *xattrs; struct spdk_xattr *xattr; size_t desc_size; + void *tmp; blob_verify_md_op(blob); @@ -7134,9 +7168,14 @@ blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, TAILQ_FOREACH(xattr, xattrs, link) { if (!strcmp(name, xattr->name)) { + tmp = malloc(value_len); + if (!tmp) { + return -ENOMEM; + } + free(xattr->value); xattr->value_len = value_len; - xattr->value = malloc(value_len); + xattr->value = tmp; memcpy(xattr->value, value, value_len); blob->state = SPDK_BLOB_STATE_DIRTY; @@ -7149,9 +7188,20 @@ blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, if (!xattr) { return -ENOMEM; } + xattr->name = strdup(name); + if (!xattr->name) { + free(xattr); + return -ENOMEM; + } + xattr->value_len = value_len; xattr->value = malloc(value_len); + if (!xattr->value) { + free(xattr->name); + free(xattr); + return -ENOMEM; + } memcpy(xattr->value, value, value_len); TAILQ_INSERT_TAIL(xattrs, xattr, link); diff --git a/lib/blob/blobstore.h b/lib/blob/blobstore.h index def4346f4e5..0c308ebeda2 100644 --- a/lib/blob/blobstore.h +++ b/lib/blob/blobstore.h @@ -186,8 +186,9 @@ struct spdk_blob_store { struct spdk_bs_dev *dev; struct spdk_bit_array *used_md_pages; - struct spdk_bit_array *used_clusters; + struct spdk_bit_pool *used_clusters; struct spdk_bit_array *used_blobids; + 
struct spdk_bit_array *open_blobids; pthread_mutex_t used_clusters_mutex; diff --git a/lib/blobfs/blobfs.c b/lib/blobfs/blobfs.c index fde016f6673..3af6b0639f6 100644 --- a/lib/blobfs/blobfs.c +++ b/lib/blobfs/blobfs.c @@ -554,6 +554,7 @@ static void fs_conf_parse(void) { struct spdk_conf_section *sp; + int cache_buffer_shift; sp = spdk_conf_find_section(NULL, "Blobfs"); if (sp == NULL) { @@ -561,9 +562,11 @@ fs_conf_parse(void) return; } - g_fs_cache_buffer_shift = spdk_conf_section_get_intval(sp, "CacheBufferShift"); - if (g_fs_cache_buffer_shift <= 0) { + cache_buffer_shift = spdk_conf_section_get_intval(sp, "CacheBufferShift"); + if (cache_buffer_shift <= 0) { g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + } else { + g_fs_cache_buffer_shift = cache_buffer_shift; } } @@ -664,10 +667,15 @@ file_alloc(struct spdk_filesystem *fs) return NULL; } + if (pthread_spin_init(&file->lock, 0)) { + free(file->tree); + free(file); + return NULL; + } + file->fs = fs; TAILQ_INIT(&file->open_requests); TAILQ_INIT(&file->sync_requests); - pthread_spin_init(&file->lock, 0); TAILQ_INSERT_TAIL(&fs->files, file, tailq); file->priority = SPDK_FILE_PRIORITY_LOW; return file; @@ -1993,11 +2001,15 @@ spdk_fs_alloc_thread_ctx(struct spdk_filesystem *fs) return NULL; } + if (pthread_spin_init(&ctx->ch.lock, 0)) { + free(ctx); + return NULL; + } + fs_channel_create(fs, &ctx->ch, 512); ctx->ch.send_request = fs->send_request; ctx->ch.sync = 1; - pthread_spin_init(&ctx->ch.lock, 0); return ctx; } @@ -2088,7 +2100,7 @@ _blobfs_cache_pool_reclaim(void *arg) int rc; if (!blobfs_cache_pool_need_reclaim()) { - return 0; + return SPDK_POLLER_IDLE; } TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) { @@ -2099,7 +2111,7 @@ _blobfs_cache_pool_reclaim(void *arg) continue; } if (!blobfs_cache_pool_need_reclaim()) { - return 1; + return SPDK_POLLER_BUSY; } break; } @@ -2112,7 +2124,7 @@ _blobfs_cache_pool_reclaim(void *arg) continue; } if (!blobfs_cache_pool_need_reclaim()) { - return 1; + 
return SPDK_POLLER_BUSY; } break; } @@ -2126,7 +2138,7 @@ _blobfs_cache_pool_reclaim(void *arg) break; } - return 1; + return SPDK_POLLER_BUSY; } static void diff --git a/lib/blobfs/tree.c b/lib/blobfs/tree.c index 32779766f4c..be9417386ff 100644 --- a/lib/blobfs/tree.c +++ b/lib/blobfs/tree.c @@ -87,6 +87,7 @@ tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer) while (offset >= CACHE_TREE_LEVEL_SIZE(root->level + 1)) { if (root->present_mask != 0) { tree = calloc(1, sizeof(*tree)); + assert(tree != NULL); tree->level = root->level + 1; tree->u.tree[0] = root; root = tree; @@ -103,6 +104,7 @@ tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer) offset &= CACHE_TREE_LEVEL_MASK(tree->level); if (tree->u.tree[index] == NULL) { tree->u.tree[index] = calloc(1, sizeof(*tree)); + assert(tree->u.tree[index] != NULL); tree->u.tree[index]->level = tree->level - 1; tree->present_mask |= (1ULL << index); } diff --git a/lib/conf/Makefile b/lib/conf/Makefile index 667f72a13b8..09966ea1205 100644 --- a/lib/conf/Makefile +++ b/lib/conf/Makefile @@ -35,7 +35,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk SO_VER := 2 -SO_MINOR := 0 +SO_MINOR := 1 C_SRCS = conf.c LIBNAME = conf diff --git a/lib/conf/conf.c b/lib/conf/conf.c index 7492bba8930..287e157a5e7 100644 --- a/lib/conf/conf.c +++ b/lib/conf/conf.c @@ -60,9 +60,11 @@ struct spdk_conf { char *file; struct spdk_conf_section *current_section; struct spdk_conf_section *section; + bool merge_sections; }; #define CF_DELIM " \t" +#define CF_DELIM_KEY " \t=" #define LIB_MAX_TMPBUF 1024 @@ -71,7 +73,13 @@ static struct spdk_conf *default_config = NULL; struct spdk_conf * spdk_conf_allocate(void) { - return calloc(1, sizeof(struct spdk_conf)); + struct spdk_conf *ret = calloc(1, sizeof(struct spdk_conf)); + + if (ret) { + ret->merge_sections = true; + } + + return ret; } static void @@ -479,7 +487,12 @@ parse_line(struct spdk_conf *cp, char *lp) num = 0; } - sp = spdk_conf_find_section(cp, key); + if (cp->merge_sections) { + sp = spdk_conf_find_section(cp, key); + } else { + sp = NULL; + } + if (sp == NULL) { sp = allocate_cf_section(); append_cf_section(cp, sp); @@ -501,7 +514,7 @@ parse_line(struct spdk_conf *cp, char *lp) SPDK_ERRLOG("unknown section\n"); return -1; } - key = spdk_strsepq(&arg, CF_DELIM); + key = spdk_strsepq(&arg, CF_DELIM_KEY); if (key == NULL) { SPDK_ERRLOG("broken key\n"); return -1; @@ -683,3 +696,9 @@ spdk_conf_set_as_default(struct spdk_conf *cp) { default_config = cp; } + +void +spdk_conf_disable_sections_merge(struct spdk_conf *cp) +{ + cp->merge_sections = false; +} diff --git a/lib/conf/spdk_conf.map b/lib/conf/spdk_conf.map index 094b9d67b2f..0fc01c8aa2c 100644 --- a/lib/conf/spdk_conf.map +++ b/lib/conf/spdk_conf.map @@ -17,6 +17,7 @@ spdk_conf_section_get_intval; spdk_conf_section_get_boolval; spdk_conf_set_as_default; + spdk_conf_disable_sections_merge; local: *; }; diff --git a/lib/env_dpdk/Makefile b/lib/env_dpdk/Makefile index df3a4aaa477..11433fe8642 100644 --- a/lib/env_dpdk/Makefile +++ b/lib/env_dpdk/Makefile @@ -34,12 +34,12 @@ 
SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) include $(SPDK_ROOT_DIR)/mk/spdk.common.mk -SO_VER := 4 +SO_VER := 5 SO_MINOR := 0 CFLAGS += $(ENV_CFLAGS) C_SRCS = env.c memory.c pci.c init.c threads.c -C_SRCS += pci_nvme.c pci_ioat.c pci_virtio.c pci_vmd.c pci_idxd.c +C_SRCS += pci_ioat.c pci_virtio.c pci_vmd.c pci_idxd.c LIBNAME = env_dpdk SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_env_dpdk.map) diff --git a/lib/env_dpdk/env.c b/lib/env_dpdk/env.c index b10bbb5c112..6294bbe51b5 100644 --- a/lib/env_dpdk/env.c +++ b/lib/env_dpdk/env.c @@ -34,6 +34,7 @@ #include "spdk/stdinc.h" #include "spdk/util.h" #include "spdk/env_dpdk.h" +#include "spdk/log.h" #include "env_internal.h" @@ -70,7 +71,7 @@ spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint3 buf = rte_malloc_socket(NULL, size, align, socket_id); if (buf && phys_addr) { #ifdef DEBUG - fprintf(stderr, "phys_addr param in spdk_*malloc() is deprecated\n"); + SPDK_ERRLOG("phys_addr param in spdk_*malloc() is deprecated\n"); #endif *phys_addr = virt_to_phys(buf); } @@ -150,14 +151,9 @@ spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id, const struct rte_memzone *mz; unsigned dpdk_flags = 0; -#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) - /* Older DPDKs do not offer such flag since their - * memzones are iova-contiguous by default. 
- */ if ((flags & SPDK_MEMZONE_NO_IOVA_CONTIG) == 0) { dpdk_flags |= RTE_MEMZONE_IOVA_CONTIG; } -#endif if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { socket_id = SOCKET_ID_ANY; diff --git a/lib/env_dpdk/env.mk b/lib/env_dpdk/env.mk index 5f21a839d8b..c2bfb0d1942 100644 --- a/lib/env_dpdk/env.mk +++ b/lib/env_dpdk/env.mk @@ -48,10 +48,10 @@ DPDK_INC_DIR := $(DPDK_ABS_DIR)/include/dpdk endif DPDK_INC := -I$(DPDK_INC_DIR) -ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_eal.a)) -DPDK_LIB_EXT = .a -else +ifeq ($(CONFIG_SHARED),y) DPDK_LIB_EXT = .so +else +DPDK_LIB_EXT = .a endif DPDK_LIB_LIST = rte_eal rte_mempool rte_ring rte_mbuf @@ -78,6 +78,11 @@ ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_bus_pci.*)) DPDK_LIB_LIST += rte_bus_pci endif +# DPDK 20.05 eal dependency +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_telemetry.*)) +DPDK_LIB_LIST += rte_telemetry +endif + # There are some complex dependencies when using crypto, reduce or both so # here we add the feature specific ones and set a flag to add the common # ones after that. 
@@ -89,7 +94,7 @@ endif ifeq ($(CONFIG_REDUCE),y) DPDK_FRAMEWORK=y -DPDK_LIB_LIST += rte_pmd_isal_comp +DPDK_LIB_LIST += rte_pmd_isal endif ifeq ($(DPDK_FRAMEWORK),y) @@ -139,7 +144,8 @@ else ENV_DPDK_FILE = $(call spdk_lib_list_to_static_libs,env_dpdk) endif ENV_LIBS = $(ENV_DPDK_FILE) $(DPDK_LIB) -ENV_LINKER_ARGS = $(call dpdk_env_linker_args,$(DPDK_LIB_LIST)) +ENV_LINKER_ARGS = -Wl,-rpath-link $(DPDK_ABS_DIR)/lib +ENV_LINKER_ARGS += $(call dpdk_env_linker_args,$(DPDK_LIB_LIST)) ifeq ($(CONFIG_IPSEC_MB),y) ENV_LINKER_ARGS += -lIPSec_MB -L$(IPSEC_MB_DIR) @@ -155,6 +161,13 @@ ENV_LINKER_ARGS += -lnuma endif endif +# DPDK built with meson puts those defines elsewhere +ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_build_config.h)) +ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_build_config.h)) +ENV_LINKER_ARGS += -lnuma +endif +endif + ifeq ($(OS),Linux) ENV_LINKER_ARGS += -ldl endif diff --git a/lib/env_dpdk/env_internal.h b/lib/env_dpdk/env_internal.h index 1d7845a98c6..c7900d9d3aa 100644 --- a/lib/env_dpdk/env_internal.h +++ b/lib/env_dpdk/env_internal.h @@ -46,8 +46,8 @@ #include #include -#if RTE_VERSION < RTE_VERSION_NUM(17, 11, 0, 0) -#error RTE_VERSION is too old! Minimum 17.11 is required. +#if RTE_VERSION < RTE_VERSION_NUM(18, 11, 0, 0) +#error RTE_VERSION is too old! Minimum 18.11 is required. 
#endif /* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47], @@ -59,25 +59,24 @@ #define SHIFT_1GB 30 /* (1 << 30) == 1 GB */ #define MASK_1GB ((1ULL << SHIFT_1GB) - 1) -#define SPDK_PMD_REGISTER_PCI(pci_drv) \ -__attribute__((constructor)) static void pci_drv ## _register(void) \ -{ \ - pci_driver_register(&pci_drv); \ -} - +#define SPDK_PCI_DRIVER_MAX_NAME_LEN 32 struct spdk_pci_driver { struct rte_pci_driver driver; + + const char *name; + const struct spdk_pci_id *id_table; + uint32_t drv_flags; + spdk_pci_enum_cb cb_fn; void *cb_arg; - bool is_registered; TAILQ_ENTRY(spdk_pci_driver) tailq; }; -void pci_driver_register(struct spdk_pci_driver *driver); int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device); int pci_device_fini(struct rte_pci_device *device); void pci_env_init(void); +void pci_env_reinit(void); void pci_env_fini(void); int mem_map_init(bool legacy_mem); int vtophys_init(void); diff --git a/lib/env_dpdk/init.c b/lib/env_dpdk/init.c index eb0d7baa808..d5eaf050be0 100644 --- a/lib/env_dpdk/init.c +++ b/lib/env_dpdk/init.c @@ -37,6 +37,7 @@ #include "spdk/version.h" #include "spdk/env_dpdk.h" +#include "spdk/log.h" #include #include @@ -105,23 +106,6 @@ _sprintf_alloc(const char *format, ...) return NULL; } -static void -env_unlink_shared_files(void) -{ - /* Starting with DPDK 18.05, there are more files with unpredictable paths - * and filenames. The --no-shconf option prevents from creating them, but - * only for DPDK 18.08+. For DPDK 18.05 we just leave them be. - */ -#if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0) - char buffer[PATH_MAX]; - - snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid()); - if (unlink(buffer)) { - fprintf(stderr, "Unable to unlink shared memory file: %s. 
Error code: %d\n", buffer, errno); - } -#endif -} - void spdk_env_opts_init(struct spdk_env_opts *opts) { @@ -145,6 +129,10 @@ free_args(char **args, int argcount) { int i; + if (args == NULL) { + return; + } + for (i = 0; i < argcount; i++) { free(args[i]); } @@ -160,7 +148,7 @@ push_arg(char *args[], int *argcount, char *arg) char **tmp; if (arg == NULL) { - fprintf(stderr, "%s: NULL arg supplied\n", __func__); + SPDK_ERRLOG("%s: NULL arg supplied\n", __func__); free_args(args, *argcount); return NULL; } @@ -348,16 +336,6 @@ build_eal_cmdline(const struct spdk_env_opts *opts) } } -#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0) - /* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */ - if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { - args = push_arg(args, &argcount, _sprintf_alloc("--legacy-mem")); - if (args == NULL) { - return -1; - } - } -#endif - if (opts->num_pci_addr) { size_t i; char bdf[32]; @@ -375,8 +353,6 @@ build_eal_cmdline(const struct spdk_env_opts *opts) } } - /* The following log-level options are not understood by older DPDKs */ -#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. * This can be overridden by specifying the same option in opts->env_context */ @@ -402,7 +378,6 @@ build_eal_cmdline(const struct spdk_env_opts *opts) if (args == NULL) { return -1; } -#endif if (opts->env_context) { args = push_arg(args, &argcount, strdup(opts->env_context)); @@ -413,36 +388,43 @@ build_eal_cmdline(const struct spdk_env_opts *opts) #ifdef __linux__ - /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, - * but DPDK guesses it should be iova-mode=va. Add a check and force - * iova-mode=pa here. 
*/ - if (rte_vfio_noiommu_is_enabled()) { - args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (opts->iova_mode) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); if (args == NULL) { return -1; } - } + } else { + /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, + * but DPDK guesses it should be iova-mode=va. Add a check and force + * iova-mode=pa here. */ + if (rte_vfio_noiommu_is_enabled()) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } #if defined(__x86_64__) - /* DPDK by default guesses that it should be using iova-mode=va so that it can - * support running as an unprivileged user. However, some systems (especially - * virtual machines) don't have an IOMMU capable of handling the full virtual - * address space and DPDK doesn't currently catch that. Add a check in SPDK - * and force iova-mode=pa here. */ - if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { + /* DPDK by default guesses that it should be using iova-mode=va so that it can + * support running as an unprivileged user. However, some systems (especially + * virtual machines) don't have an IOMMU capable of handling the full virtual + * address space and DPDK doesn't currently catch that. Add a check in SPDK + * and force iova-mode=pa here. */ + if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } +#elif defined(__PPC64__) + /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly + * auto-detect at the moment, so we'll just force it here. */ args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); if (args == NULL) { return -1; } - } -#elif defined(__PPC64__) - /* On Linux + PowerPC, DPDK doesn't support VA mode at all. 
Unfortunately, it doesn't correctly - * auto-detect at the moment, so we'll just force it here. */ - args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); - if (args == NULL) { - return -1; - } #endif + } /* Set the base virtual address - it must be an address that is not in the @@ -505,13 +487,13 @@ spdk_env_dpdk_post_init(bool legacy_mem) rc = mem_map_init(legacy_mem); if (rc < 0) { - fprintf(stderr, "Failed to allocate mem_map\n"); + SPDK_ERRLOG("Failed to allocate mem_map\n"); return rc; } rc = vtophys_init(); if (rc < 0) { - fprintf(stderr, "Failed to initialize vtophys\n"); + SPDK_ERRLOG("Failed to initialize vtophys\n"); return rc; } @@ -524,6 +506,8 @@ spdk_env_dpdk_post_fini(void) pci_env_fini(); free_args(g_eal_cmdline, g_eal_cmdline_argcount); + g_eal_cmdline = NULL; + g_eal_cmdline_argcount = 0; } int @@ -534,20 +518,38 @@ spdk_env_init(const struct spdk_env_opts *opts) int orig_optind; bool legacy_mem; - g_external_init = false; + /* If SPDK env has been initialized before, then only pci env requires + * reinitialization. 
+ */ + if (g_external_init == false) { + if (opts != NULL) { + fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); + return -EINVAL; + } + + printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); + pci_env_reinit(); + + return 0; + } + + if (opts == NULL) { + fprintf(stderr, "NULL arguments to initialize DPDK\n"); + return -EINVAL; + } rc = build_eal_cmdline(opts); if (rc < 0) { - fprintf(stderr, "Invalid arguments to initialize DPDK\n"); + SPDK_ERRLOG("Invalid arguments to initialize DPDK\n"); return -EINVAL; } - printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); - printf("[ DPDK EAL parameters: "); + SPDK_PRINTF("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); + SPDK_PRINTF("[ DPDK EAL parameters: "); for (i = 0; i < g_eal_cmdline_argcount; i++) { - printf("%s ", g_eal_cmdline[i]); + SPDK_PRINTF("%s ", g_eal_cmdline[i]); } - printf("]\n"); + SPDK_PRINTF("]\n"); /* DPDK rearranges the array we pass to it, so make a copy * before passing so we can still free the individual strings @@ -555,7 +557,7 @@ spdk_env_init(const struct spdk_env_opts *opts) */ dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); if (dpdk_args == NULL) { - fprintf(stderr, "Failed to allocate dpdk_args\n"); + SPDK_ERRLOG("Failed to allocate dpdk_args\n"); return -ENOMEM; } memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); @@ -570,30 +572,24 @@ spdk_env_init(const struct spdk_env_opts *opts) if (rc < 0) { if (rte_errno == EALREADY) { - fprintf(stderr, "DPDK already initialized\n"); + SPDK_ERRLOG("DPDK already initialized\n"); } else { - fprintf(stderr, "Failed to initialize DPDK\n"); + SPDK_ERRLOG("Failed to initialize DPDK\n"); } return -rte_errno; } - if (opts->shm_id < 0 && !opts->hugepage_single_segments) { - /* - * Unlink hugepage and config info files after init. 
This will ensure they get - * deleted on app exit, even if the app crashes and does not exit normally. - * Only do this when not in multi-process mode, since for multi-process other - * apps will need to open these files. These files are not created for - * "single file segments". - */ - env_unlink_shared_files(); - } - legacy_mem = false; if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { legacy_mem = true; } - return spdk_env_dpdk_post_init(legacy_mem); + rc = spdk_env_dpdk_post_init(legacy_mem); + if (rc == 0) { + g_external_init = false; + } + + return rc; } void diff --git a/lib/env_dpdk/memory.c b/lib/env_dpdk/memory.c index 92d2b3982a3..304bc3e1e1f 100644 --- a/lib/env_dpdk/memory.c +++ b/lib/env_dpdk/memory.c @@ -47,6 +47,7 @@ #include "spdk/util.h" #include "spdk/memory.h" #include "spdk/env_dpdk.h" +#include "spdk/log.h" #ifdef __FreeBSD__ #define VFIO_ENABLED 0 @@ -87,7 +88,7 @@ static struct vfio_cfg g_vfio = { #endif #if DEBUG -#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__) #else #define DEBUG_PRINT(...) 
#endif @@ -703,7 +704,6 @@ spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t return orig_translation; } -#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) static void memory_hotplug_cb(enum rte_mem_event event_type, const void *addr, size_t len, void *arg) @@ -751,7 +751,6 @@ memory_iter_cb(const struct rte_memseg_list *msl, { return spdk_mem_register(ms->addr, len); } -#endif int mem_map_init(bool legacy_mem) @@ -768,24 +767,8 @@ mem_map_init(bool legacy_mem) * Walk all DPDK memory segments and register them * with the master memory map */ -#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL); rte_memseg_contig_walk(memory_iter_cb, NULL); -#else - struct rte_mem_config *mcfg; - size_t seg_idx; - - mcfg = rte_eal_get_configuration()->mem_config; - for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) { - struct rte_memseg *seg = &mcfg->memseg[seg_idx]; - - if (seg->addr == NULL) { - break; - } - - spdk_mem_register(seg->addr, seg->len); - } -#endif return 0; } @@ -938,7 +921,6 @@ vtophys_get_paddr_memseg(uint64_t vaddr) uintptr_t paddr; struct rte_memseg *seg; -#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL); if (seg != NULL) { paddr = seg->phys_addr; @@ -948,28 +930,6 @@ vtophys_get_paddr_memseg(uint64_t vaddr) paddr += (vaddr - (uintptr_t)seg->addr); return paddr; } -#else - struct rte_mem_config *mcfg; - uint32_t seg_idx; - - mcfg = rte_eal_get_configuration()->mem_config; - for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) { - seg = &mcfg->memseg[seg_idx]; - if (seg->addr == NULL) { - break; - } - - if (vaddr >= (uintptr_t)seg->addr && - vaddr < ((uintptr_t)seg->addr + seg->len)) { - paddr = seg->phys_addr; - if (paddr == RTE_BAD_IOVA) { - return SPDK_VTOPHYS_ERROR; - } - paddr += (vaddr - (uintptr_t)seg->addr); - return paddr; - } - } -#endif return SPDK_VTOPHYS_ERROR; } diff --git a/lib/env_dpdk/pci.c 
b/lib/env_dpdk/pci.c index f43e5272bad..43b0e9f359a 100644 --- a/lib/env_dpdk/pci.c +++ b/lib/env_dpdk/pci.c @@ -34,7 +34,9 @@ #include "env_internal.h" #include +#include #include "spdk/env.h" +#include "spdk/log.h" #define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers" @@ -47,7 +49,6 @@ #define DPDK_HOTPLUG_RETRY_COUNT 4 /* DPDK alarm/interrupt thread */ -static pthread_t g_dpdk_tid; static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER; static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices); /* devices hotplugged on a dpdk thread */ @@ -81,10 +82,6 @@ cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t of rc = rte_pci_read_config(dev->dev_handle, value, len, offset); -#if defined(__FreeBSD__) && RTE_VERSION < RTE_VERSION_NUM(18, 11, 0, 0) - /* Older DPDKs return 0 on success and -1 on failure */ - return rc; -#endif return (rc > 0 && (uint32_t) rc == len) ? 0 : -1; } @@ -103,11 +100,8 @@ cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t o } static void -detach_rte_cb(void *_dev) +remove_rte_dev(struct rte_pci_device *rte_dev) { - struct rte_pci_device *rte_dev = _dev; - -#if RTE_VERSION >= RTE_VERSION_NUM(18, 11, 0, 0) char bdf[32]; int i = 0, rc; @@ -115,9 +109,12 @@ detach_rte_cb(void *_dev) do { rc = rte_eal_hotplug_remove("pci", bdf); } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT); -#else - rte_eal_dev_detach(&rte_dev->device); -#endif +} + +static void +detach_rte_cb(void *_dev) +{ + remove_rte_dev(_dev); } static void @@ -127,17 +124,20 @@ detach_rte(struct spdk_pci_device *dev) int i; bool removed; - /* The device was already marked as available and could be attached - * again while we go asynchronous, so we explicitly forbid that. 
- */ - dev->internal.pending_removal = true; - if (!spdk_process_is_primary() || pthread_equal(g_dpdk_tid, pthread_self())) { - detach_rte_cb(rte_dev); + if (!spdk_process_is_primary()) { + remove_rte_dev(rte_dev); return; } + pthread_mutex_lock(&g_pci_mutex); + dev->internal.attached = false; + /* prevent the hotremove notification from removing this device */ + dev->internal.pending_removal = true; + pthread_mutex_unlock(&g_pci_mutex); + rte_eal_alarm_set(1, detach_rte_cb, rte_dev); - /* wait up to 2s for the cb to finish executing */ + + /* wait up to 2s for the cb to execute */ for (i = 2000; i > 0; i--) { spdk_delay_us(1000); @@ -165,66 +165,86 @@ detach_rte(struct spdk_pci_device *dev) removed = dev->internal.removed; pthread_mutex_unlock(&g_pci_mutex); if (!removed) { - fprintf(stderr, "Timeout waiting for DPDK to remove PCI device %s.\n", - rte_dev->name); + SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n", + rte_dev->name); /* If we reach this state, then the device couldn't be removed and most likely a subsequent hot add of a device in the same BDF will fail */ } } void -pci_driver_register(struct spdk_pci_driver *driver) +spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags) { + struct spdk_pci_driver *driver; + + driver = calloc(1, sizeof(*driver)); + if (!driver) { + /* we can't do any better than bailing atm */ + return; + } + + driver->name = name; + driver->id_table = id_table; + driver->drv_flags = flags; TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq); } -#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0) -static void -pci_device_rte_hotremove_cb(void *dev) +struct spdk_pci_driver * +spdk_pci_nvme_get_driver(void) { - detach_rte((struct spdk_pci_device *)dev); + return spdk_pci_get_driver("nvme"); +} + +struct spdk_pci_driver * +spdk_pci_get_driver(const char *name) +{ + struct spdk_pci_driver *driver; + + TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { + if (strcmp(driver->name, name) == 0) { + 
return driver; + } + } + + return NULL; } static void -pci_device_rte_hotremove(const char *device_name, +pci_device_rte_dev_event(const char *device_name, enum rte_dev_event_type event, void *cb_arg) { struct spdk_pci_device *dev; bool can_detach = false; - if (event != RTE_DEV_EVENT_REMOVE) { - return; - } - - pthread_mutex_lock(&g_pci_mutex); - TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { - struct rte_pci_device *rte_dev = dev->dev_handle; - if (strcmp(rte_dev->name, device_name) == 0 && - !dev->internal.pending_removal) { - can_detach = !dev->internal.attached; - /* prevent any further attaches */ - dev->internal.pending_removal = true; - break; + switch (event) { + default: + case RTE_DEV_EVENT_ADD: + /* Nothing to do here yet. */ + break; + case RTE_DEV_EVENT_REMOVE: + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + struct rte_pci_device *rte_dev = dev->dev_handle; + + if (strcmp(rte_dev->name, device_name) == 0 && + !dev->internal.pending_removal) { + can_detach = !dev->internal.attached; + /* prevent any further attaches */ + dev->internal.pending_removal = true; + break; + } } - } - pthread_mutex_unlock(&g_pci_mutex); + pthread_mutex_unlock(&g_pci_mutex); - if (dev != NULL && can_detach) { - /* If device is not attached, we can remove it right away. - * - * Because the user's callback is invoked in eal interrupt - * callback, the interrupt callback need to be finished before - * it can be unregistered when detaching device. So finish - * callback soon and use a deferred removal to detach device - * is need. It is a workaround, once the device detaching be - * moved into the eal in the future, the deferred removal could - * be deleted. - */ - rte_eal_alarm_set(1, pci_device_rte_hotremove_cb, dev); + if (dev != NULL && can_detach) { + /* if device is not attached we can remove it right away. + * Otherwise it will be removed at detach. 
*/ + remove_rte_dev(dev->dev_handle); + } + break; } } -#endif static void cleanup_pci_devices(void) @@ -252,49 +272,104 @@ cleanup_pci_devices(void) pthread_mutex_unlock(&g_pci_mutex); } -static void -_get_alarm_thread_cb(void *unused) +static int scan_pci_bus(bool delay_init); + +/* translate spdk_pci_driver to an rte_pci_driver and register it to dpdk */ +static int +register_rte_driver(struct spdk_pci_driver *driver) +{ + unsigned pci_id_count = 0; + struct rte_pci_id *rte_id_table; + char *rte_name; + size_t rte_name_len; + uint32_t rte_flags; + + assert(driver->id_table); + while (driver->id_table[pci_id_count].vendor_id) { + pci_id_count++; + } + assert(pci_id_count > 0); + + rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table)); + if (!rte_id_table) { + return -ENOMEM; + } + + while (pci_id_count > 0) { + struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1]; + const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1]; + + rte_id->class_id = spdk_id->class_id; + rte_id->vendor_id = spdk_id->vendor_id; + rte_id->device_id = spdk_id->device_id; + rte_id->subsystem_vendor_id = spdk_id->subvendor_id; + rte_id->subsystem_device_id = spdk_id->subdevice_id; + pci_id_count--; + } + + assert(driver->name); + rte_name_len = strlen(driver->name) + strlen("spdk_") + 1; + rte_name = calloc(rte_name_len, 1); + if (!rte_name) { + free(rte_id_table); + return -ENOMEM; + } + + snprintf(rte_name, rte_name_len, "spdk_%s", driver->name); + driver->driver.driver.name = rte_name; + driver->driver.id_table = rte_id_table; + + rte_flags = 0; + if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) { + rte_flags |= RTE_PCI_DRV_NEED_MAPPING; + } + if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) { + rte_flags |= RTE_PCI_DRV_WC_ACTIVATE; + } + driver->driver.drv_flags = rte_flags; + + driver->driver.probe = pci_device_init; + driver->driver.remove = pci_device_fini; + + rte_pci_register(&driver->driver); + return 0; +} + +static inline void 
+_pci_env_init(void) { - g_dpdk_tid = pthread_self(); + /* We assume devices were present on the bus for more than 2 seconds + * before initializing SPDK and there's no need to wait more. We scan + * the bus, but we don't blacklist any devices. + */ + scan_pci_bus(false); + + /* Register a single hotremove callback for all devices. */ + if (spdk_process_is_primary()) { + rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL); + } } void pci_env_init(void) { -#if RTE_VERSION >= RTE_VERSION_NUM(18, 11, 0, 0) struct spdk_pci_driver *driver; - /* We need to pre-register pci drivers for the pci devices to be - * attachable in multi-process with DPDK 18.11+. - * - * DPDK 18.11+ does its best to ensure all devices are equally - * attached or detached in all processes within a shared memory group. - * For SPDK it means that if a device is hotplugged in the primary, - * then DPDK will automatically send an IPC hotplug request to all other - * processes. Those other processes may not have the same SPDK PCI - * driver registered and may fail to attach the device. DPDK will send - * back the failure status, and the the primary process will also fail - * to hotplug the device. To prevent that, we need to pre-register the - * pci drivers here. - */ TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { - assert(!driver->is_registered); - driver->is_registered = true; - rte_pci_register(&driver->driver); + register_rte_driver(driver); } -#endif -#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0) - /* Register a single hotremove callback for all devices. */ - if (spdk_process_is_primary()) { - rte_dev_event_callback_register(NULL, pci_device_rte_hotremove, NULL); - } -#endif + _pci_env_init(); +} - rte_eal_alarm_set(1, _get_alarm_thread_cb, NULL); - /* alarms are executed in order, so this one will be always executed - * before any real hotremove alarms and we don't need to wait for it. 
+void +pci_env_reinit(void) +{ + /* There is no need to register pci drivers again, since they were + * already pre-registered in pci_env_init. */ + + _pci_env_init(); } void @@ -307,15 +382,13 @@ pci_env_fini(void) TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { if (dev->internal.attached) { spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr); - fprintf(stderr, "Device %s is still attached at shutdown!\n", bdf); + SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf); } } -#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0) if (spdk_process_is_primary()) { - rte_dev_event_callback_unregister(NULL, pci_device_rte_hotremove, NULL); + rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL); } -#endif } int @@ -326,15 +399,6 @@ pci_device_init(struct rte_pci_driver *_drv, struct spdk_pci_device *dev; int rc; -#if RTE_VERSION < RTE_VERSION_NUM(18, 11, 0, 0) - if (!driver->cb_fn) { - /* Return a positive value to indicate that this device does - * not belong to this driver, but this isn't an error. 
- */ - return 1; - } -#endif - dev = calloc(1, sizeof(*dev)); if (dev == NULL) { return -1; @@ -346,6 +410,7 @@ pci_device_init(struct rte_pci_driver *_drv, dev->addr.bus = _dev->addr.bus; dev->addr.dev = _dev->addr.devid; dev->addr.func = _dev->addr.function; + dev->id.class_id = _dev->id.class_id; dev->id.vendor_id = _dev->id.vendor_id; dev->id.device_id = _dev->id.device_id; dev->id.subvendor_id = _dev->id.subsystem_vendor_id; @@ -357,7 +422,6 @@ pci_device_init(struct rte_pci_driver *_drv, dev->unmap_bar = unmap_bar_rte; dev->cfg_read = cfg_read_rte; dev->cfg_write = cfg_write_rte; - dev->detach = detach_rte; dev->internal.driver = driver; dev->internal.claim_fd = -1; @@ -395,6 +459,11 @@ pci_device_fini(struct rte_pci_device *_dev) return -1; } + /* remove our whitelist_at option */ + if (_dev->device.devargs) { + _dev->device.devargs->data = NULL; + } + assert(!dev->internal.removed); dev->internal.removed = true; pthread_mutex_unlock(&g_pci_mutex); @@ -411,18 +480,92 @@ spdk_pci_device_detach(struct spdk_pci_device *dev) spdk_pci_device_unclaim(dev); } - dev->internal.attached = false; - dev->detach(dev); + if (strcmp(dev->type, "pci") == 0) { + /* if it's a physical device we need to deal with DPDK on + * a different process and we can't just unset one flag + * here. We also want to stop using any device resources + * so that the device isn't "in use" by the userspace driver + * once we detach it. This would allow attaching the device + * to a different process, or to a kernel driver like nvme. 
+ */ + detach_rte(dev); + } else { + dev->internal.attached = false; + } cleanup_pci_devices(); } +static int +scan_pci_bus(bool delay_init) +{ + struct spdk_pci_driver *driver; + struct rte_pci_device *rte_dev; + uint64_t now; + + rte_bus_scan(); + now = spdk_get_ticks(); + + driver = TAILQ_FIRST(&g_pci_drivers); + if (!driver) { + return 0; + } + + TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) { + struct rte_devargs *da; + + da = rte_dev->device.devargs; + if (!da) { + char devargs_str[128]; + + /* the device was never blacklisted or whitelisted */ + da = calloc(1, sizeof(*da)); + if (!da) { + return -1; + } + + snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name); + if (rte_devargs_parse(da, devargs_str) != 0) { + free(da); + return -1; + } + + rte_devargs_insert(&da); + rte_dev->device.devargs = da; + } + + if (da->data) { + uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data; + + /* this device was seen by spdk before... */ + if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) { + da->policy = RTE_DEV_WHITELISTED; + } + } else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST && + da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) { + /* override the policy only if not permanently blacklisted */ + + if (delay_init) { + da->policy = RTE_DEV_BLACKLISTED; + da->data = (void *)(now + 2 * spdk_get_ticks_hz()); + } else { + da->policy = RTE_DEV_WHITELISTED; + da->data = (void *)(uintptr_t)now; + } + } + } + + return 0; +} + int spdk_pci_device_attach(struct spdk_pci_driver *driver, spdk_pci_enum_cb enum_cb, void *enum_ctx, struct spdk_pci_addr *pci_address) { struct spdk_pci_device *dev; + struct rte_pci_device *rte_dev; + struct rte_devargs *da; int rc; char bdf[32]; @@ -451,15 +594,9 @@ spdk_pci_device_attach(struct spdk_pci_driver *driver, return rc; } - if (!driver->is_registered) { - driver->is_registered = true; - rte_pci_register(&driver->driver); - } - driver->cb_fn 
= enum_cb; driver->cb_arg = enum_ctx; -#if RTE_VERSION >= RTE_VERSION_NUM(18, 11, 0, 0) int i = 0; do { @@ -472,15 +609,34 @@ spdk_pci_device_attach(struct spdk_pci_driver *driver, */ rc = 0; } -#else - rc = rte_eal_dev_attach(bdf, ""); -#endif driver->cb_arg = NULL; driver->cb_fn = NULL; cleanup_pci_devices(); - return rc == 0 ? 0 : -1; + + if (rc != 0) { + return -1; + } + + /* explicit attach ignores the whitelist, so if we blacklisted this + * device before let's enable it now - just for clarity. + */ + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { + break; + } + } + assert(dev != NULL); + + rte_dev = dev->dev_handle; + da = rte_dev->device.devargs; + if (da && da->data) { + da->data = (void *)(uintptr_t)spdk_get_ticks(); + da->policy = RTE_DEV_WHITELISTED; + } + + return 0; } /* Note: You can call spdk_pci_enumerate from more than one thread @@ -515,15 +671,14 @@ spdk_pci_enumerate(struct spdk_pci_driver *driver, } pthread_mutex_unlock(&g_pci_mutex); - if (!driver->is_registered) { - driver->is_registered = true; - rte_pci_register(&driver->driver); + if (scan_pci_bus(true) != 0) { + return -1; } driver->cb_fn = enum_cb; driver->cb_arg = enum_ctx; - if (rte_bus_scan() != 0 || rte_bus_probe() != 0) { + if (rte_bus_probe() != 0) { driver->cb_arg = NULL; driver->cb_fn = NULL; return -1; @@ -770,12 +925,12 @@ spdk_pci_device_claim(struct spdk_pci_device *dev) dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); if (dev_fd == -1) { - fprintf(stderr, "could not open %s\n", dev_name); + SPDK_ERRLOG("could not open %s\n", dev_name); return -errno; } if (ftruncate(dev_fd, sizeof(int)) != 0) { - fprintf(stderr, "could not truncate %s\n", dev_name); + SPDK_ERRLOG("could not truncate %s\n", dev_name); close(dev_fd); return -errno; } @@ -783,15 +938,15 @@ spdk_pci_device_claim(struct spdk_pci_device *dev) dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_SHARED, dev_fd, 0); if (dev_map 
== MAP_FAILED) { - fprintf(stderr, "could not mmap dev %s (%d)\n", dev_name, errno); + SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno); close(dev_fd); return -errno; } if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) { pid = *(int *)dev_map; - fprintf(stderr, "Cannot create lock on device %s, probably" - " process %d has claimed it\n", dev_name, pid); + SPDK_ERRLOG("Cannot create lock on device %s, probably" + " process %d has claimed it\n", dev_name, pid); munmap(dev_map, sizeof(int)); close(dev_fd); /* F_SETLK returns unspecified errnos, normalize them */ @@ -894,7 +1049,6 @@ spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev) assert(dev->unmap_bar != NULL); assert(dev->cfg_read != NULL); assert(dev->cfg_write != NULL); - assert(dev->detach != NULL); dev->internal.driver = drv; TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq); } diff --git a/lib/env_dpdk/pci_idxd.c b/lib/env_dpdk/pci_idxd.c index 42f3da1ce7b..a6e4427af92 100644 --- a/lib/env_dpdk/pci_idxd.c +++ b/lib/env_dpdk/pci_idxd.c @@ -35,30 +35,16 @@ #include "spdk/pci_ids.h" -#define SPDK_IDXD_PCI_DEVICE(DEVICE_ID) RTE_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) -static struct rte_pci_id idxd_driver_id[] = { +#define SPDK_IDXD_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct spdk_pci_id idxd_driver_id[] = { {SPDK_IDXD_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IDXD)}, { .vendor_id = 0, /* sentinel */ }, }; -static struct spdk_pci_driver g_idxd_pci_drv = { - .driver = { - .drv_flags = RTE_PCI_DRV_NEED_MAPPING, - .id_table = idxd_driver_id, - .probe = pci_device_init, - .remove = pci_device_fini, - .driver.name = "spdk_idxd", - }, - - .cb_fn = NULL, - .cb_arg = NULL, - .is_registered = false, -}; - struct spdk_pci_driver * spdk_pci_idxd_get_driver(void) { - return &g_idxd_pci_drv; + return spdk_pci_get_driver("idxd"); } -SPDK_PMD_REGISTER_PCI(g_idxd_pci_drv); +SPDK_PCI_DRIVER_REGISTER(idxd, idxd_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING); diff 
--git a/lib/env_dpdk/pci_ioat.c b/lib/env_dpdk/pci_ioat.c index 74875557bd1..1b6788a6fd4 100644 --- a/lib/env_dpdk/pci_ioat.c +++ b/lib/env_dpdk/pci_ioat.c @@ -35,8 +35,8 @@ #include "spdk/pci_ids.h" -#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) RTE_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) -static struct rte_pci_id ioat_driver_id[] = { +#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct spdk_pci_id ioat_driver_id[] = { {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB0)}, {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB1)}, {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB2)}, @@ -89,24 +89,10 @@ static struct rte_pci_id ioat_driver_id[] = { { .vendor_id = 0, /* sentinel */ }, }; -static struct spdk_pci_driver g_ioat_pci_drv = { - .driver = { - .drv_flags = RTE_PCI_DRV_NEED_MAPPING, - .id_table = ioat_driver_id, - .probe = pci_device_init, - .remove = pci_device_fini, - .driver.name = "spdk_ioat", - }, - - .cb_fn = NULL, - .cb_arg = NULL, - .is_registered = false, -}; - struct spdk_pci_driver * spdk_pci_ioat_get_driver(void) { - return &g_ioat_pci_drv; + return spdk_pci_get_driver("ioat"); } -SPDK_PMD_REGISTER_PCI(g_ioat_pci_drv); +SPDK_PCI_DRIVER_REGISTER(ioat, ioat_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING); diff --git a/lib/env_dpdk/pci_virtio.c b/lib/env_dpdk/pci_virtio.c index 857edb7d77e..c30beaeb54d 100644 --- a/lib/env_dpdk/pci_virtio.c +++ b/lib/env_dpdk/pci_virtio.c @@ -35,36 +35,19 @@ #include "spdk/pci_ids.h" -static struct rte_pci_id virtio_pci_driver_id[] = { - { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) }, - { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) }, - { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_LEGACY) }, - { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_LEGACY) }, +static struct spdk_pci_id virtio_pci_driver_id[] = { + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) }, + { 
SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_LEGACY) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_LEGACY) }, { .vendor_id = 0, /* sentinel */ }, }; -static struct spdk_pci_driver g_virtio_pci_drv = { - .driver = { - .drv_flags = RTE_PCI_DRV_NEED_MAPPING -#if RTE_VERSION >= RTE_VERSION_NUM(18, 8, 0, 0) - | RTE_PCI_DRV_WC_ACTIVATE -#endif - , - .id_table = virtio_pci_driver_id, - .probe = pci_device_init, - .remove = pci_device_fini, - .driver.name = "spdk_virtio", - }, - - .cb_fn = NULL, - .cb_arg = NULL, - .is_registered = false, -}; - struct spdk_pci_driver * spdk_pci_virtio_get_driver(void) { - return &g_virtio_pci_drv; + return spdk_pci_get_driver("virtio"); } -SPDK_PMD_REGISTER_PCI(g_virtio_pci_drv); +SPDK_PCI_DRIVER_REGISTER(virtio, virtio_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); diff --git a/lib/env_dpdk/pci_vmd.c b/lib/env_dpdk/pci_vmd.c index 331e2471865..19289e051fe 100644 --- a/lib/env_dpdk/pci_vmd.c +++ b/lib/env_dpdk/pci_vmd.c @@ -35,33 +35,16 @@ #include "spdk/pci_ids.h" -static struct rte_pci_id vmd_pci_driver_id[] = { - { RTE_PCI_DEVICE(SPDK_PCI_VID_INTEL, PCI_DEVICE_ID_INTEL_VMD) }, +static struct spdk_pci_id vmd_pci_driver_id[] = { + { SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, PCI_DEVICE_ID_INTEL_VMD) }, { .vendor_id = 0, /* sentinel */ }, }; -static struct spdk_pci_driver g_vmd_pci_drv = { - .driver = { - .drv_flags = RTE_PCI_DRV_NEED_MAPPING -#if RTE_VERSION >= RTE_VERSION_NUM(18, 8, 0, 0) - | RTE_PCI_DRV_WC_ACTIVATE -#endif - , - .id_table = vmd_pci_driver_id, - .probe = pci_device_init, - .remove = pci_device_fini, - .driver.name = "spdk_vmd", - }, - - .cb_fn = NULL, - .cb_arg = NULL, - .is_registered = false, -}; - struct spdk_pci_driver * spdk_pci_vmd_get_driver(void) { - return &g_vmd_pci_drv; + return spdk_pci_get_driver("vmd"); } -SPDK_PMD_REGISTER_PCI(g_vmd_pci_drv); 
+SPDK_PCI_DRIVER_REGISTER(vmd, vmd_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); diff --git a/lib/env_dpdk/spdk_env_dpdk.map b/lib/env_dpdk/spdk_env_dpdk.map index 028796e1c54..a465f093872 100644 --- a/lib/env_dpdk/spdk_env_dpdk.map +++ b/lib/env_dpdk/spdk_env_dpdk.map @@ -51,6 +51,8 @@ spdk_ring_dequeue; spdk_iommu_is_enabled; spdk_vtophys; + spdk_pci_get_driver; + spdk_pci_driver_register; spdk_pci_nvme_get_driver; spdk_pci_vmd_get_driver; spdk_pci_idxd_get_driver; diff --git a/lib/event/Makefile b/lib/event/Makefile index 9be0da3518e..87a6209c740 100644 --- a/lib/event/Makefile +++ b/lib/event/Makefile @@ -34,7 +34,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) include $(SPDK_ROOT_DIR)/mk/spdk.common.mk -SO_VER := 3 +SO_VER := 5 SO_MINOR := 0 LIBNAME = event diff --git a/lib/event/app.c b/lib/event/app.c index ad1a4ef8135..8b5a342858d 100644 --- a/lib/event/app.c +++ b/lib/event/app.c @@ -53,6 +53,7 @@ #define SPDK_APP_DPDK_DEFAULT_MASTER_CORE -1 #define SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL -1 #define SPDK_APP_DPDK_DEFAULT_CORE_MASK "0x1" +#define SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 #define SPDK_APP_DEFAULT_CORE_LIMIT 0x140000000 /* 5 GiB */ struct spdk_app { @@ -110,7 +111,7 @@ static const struct option g_cmdline_options[] = { {"version", no_argument, NULL, VERSION_OPT_IDX}, #define PCI_BLACKLIST_OPT_IDX 'B' {"pci-blacklist", required_argument, NULL, PCI_BLACKLIST_OPT_IDX}, -#define LOGFLAG_OPT_IDX 'L' +#define LOGFLAG_OPT_IDX 'L' {"logflag", required_argument, NULL, LOGFLAG_OPT_IDX}, #define HUGE_UNLINK_OPT_IDX 'R' {"huge-unlink", no_argument, NULL, HUGE_UNLINK_OPT_IDX}, @@ -130,6 +131,10 @@ static const struct option g_cmdline_options[] = { {"json", required_argument, NULL, JSON_CONFIG_OPT_IDX}, #define JSON_CONFIG_IGNORE_INIT_ERRORS_IDX 263 {"json-ignore-init-errors", no_argument, NULL, JSON_CONFIG_IGNORE_INIT_ERRORS_IDX}, +#define IOVA_MODE_OPT_IDX 264 + {"iova-mode", required_argument, NULL, 
IOVA_MODE_OPT_IDX}, +#define BASE_VIRTADDR_OPT_IDX 265 + {"base-virtaddr", required_argument, NULL, BASE_VIRTADDR_OPT_IDX}, }; /* Global section */ @@ -284,6 +289,7 @@ spdk_app_opts_init(struct spdk_app_opts *opts) opts->master_core = SPDK_APP_DPDK_DEFAULT_MASTER_CORE; opts->mem_channel = SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL; opts->reactor_mask = NULL; + opts->base_virtaddr = SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR; opts->print_level = SPDK_APP_DEFAULT_LOG_PRINT_LEVEL; opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR; opts->num_entries = SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES; @@ -309,8 +315,8 @@ app_setup_signal_handlers(struct spdk_app_opts *opts) } /* Install the same handler for SIGINT and SIGTERM */ + g_shutdown_sig_received = false; sigact.sa_handler = __shutdown_signal; - rc = sigaction(SIGINT, &sigact, NULL); if (rc < 0) { SPDK_ERRLOG("sigaction(SIGINT) failed\n"); @@ -491,6 +497,15 @@ app_setup_env(struct spdk_app_opts *opts) struct spdk_env_opts env_opts = {}; int rc; + if (opts == NULL) { + rc = spdk_env_init(NULL); + if (rc != 0) { + SPDK_ERRLOG("Unable to reinitialize SPDK env\n"); + } + + return rc; + } + spdk_env_opts_init(&env_opts); env_opts.name = opts->name; @@ -506,7 +521,9 @@ app_setup_env(struct spdk_app_opts *opts) env_opts.num_pci_addr = opts->num_pci_addr; env_opts.pci_blacklist = opts->pci_blacklist; env_opts.pci_whitelist = opts->pci_whitelist; + env_opts.base_virtaddr = opts->base_virtaddr; env_opts.env_context = opts->env_context; + env_opts.iova_mode = opts->iova_mode; rc = spdk_env_init(&env_opts); free(env_opts.pci_blacklist); @@ -581,6 +598,7 @@ spdk_app_start(struct spdk_app_opts *opts, spdk_msg_fn start_fn, int rc; char *tty; struct spdk_cpuset tmp_cpumask = {}; + static bool g_env_was_setup = false; if (!opts) { SPDK_ERRLOG("opts should not be NULL\n"); @@ -636,7 +654,10 @@ spdk_app_start(struct spdk_app_opts *opts, spdk_msg_fn start_fn, spdk_log_set_level(SPDK_APP_DEFAULT_LOG_LEVEL); - if (app_setup_env(opts) < 0) { + /* Pass NULL to app_setup_env 
if SPDK app has been set up, in order to + * indicate that this is a reinitialization. + */ + if (app_setup_env(g_env_was_setup ? NULL : opts) < 0) { return 1; } @@ -687,6 +708,8 @@ spdk_app_start(struct spdk_app_opts *opts, spdk_msg_fn start_fn, /* This blocks until spdk_app_stop is called */ spdk_reactors_start(); + g_env_was_setup = true; + return g_spdk_app.rc; } @@ -761,6 +784,8 @@ usage(void (*app_usage)(void)) printf(" -W, --pci-whitelist \n"); printf(" pci addr to whitelist (-B and -W cannot be used at the same time)\n"); printf(" --huge-dir use a specific hugetlbfs mount to reserve memory from\n"); + printf(" --iova-mode set IOVA mode ('pa' for IOVA_PA and 'va' for IOVA_VA)\n"); + printf(" --base-virtaddr the base virtual address for DPDK (default: 0x200000000000)\n"); printf(" --num-trace-entries number of trace entries for each core, must be power of 2. (default %d)\n", SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES); spdk_log_usage(stdout, "-L"); @@ -971,9 +996,21 @@ spdk_app_parse_args(int argc, char **argv, struct spdk_app_opts *opts, goto out; } break; + case BASE_VIRTADDR_OPT_IDX: + tmp = spdk_strtoll(optarg, 0); + if (tmp <= 0) { + SPDK_ERRLOG("Invalid base-virtaddr %s\n", optarg); + usage(app_usage); + goto out; + } + opts->base_virtaddr = (uint64_t)tmp; + break; case HUGE_DIR_OPT_IDX: opts->hugedir = optarg; break; + case IOVA_MODE_OPT_IDX: + opts->iova_mode = optarg; + break; case NUM_TRACE_ENTRIES_OPT_IDX: tmp = spdk_strtoll(optarg, 0); if (tmp <= 0) { @@ -1109,7 +1146,7 @@ rpc_subsystem_init_poller_ctx(void *ctx) free(poller_ctx); } - return 1; + return SPDK_POLLER_BUSY; } static void diff --git a/lib/event/json_config.c b/lib/event/json_config.c index 684490301d0..0edcd1827ea 100644 --- a/lib/event/json_config.c +++ b/lib/event/json_config.c @@ -158,6 +158,25 @@ rpc_client_check_timeout(struct load_json_config_ctx *ctx) return 0; } +struct json_write_buf { + char data[1024]; + unsigned cur_off; +}; + +static int +json_write_stdout(void *cb_ctx, const 
void *data, size_t size) +{ + struct json_write_buf *buf = cb_ctx; + size_t rc; + + rc = snprintf(buf->data + buf->cur_off, sizeof(buf->data) - buf->cur_off, + "%s", (const char *)data); + if (rc > 0) { + buf->cur_off += rc; + } + return rc == size ? 0 : -1; +} + static int rpc_client_poller(void *arg) { @@ -179,17 +198,27 @@ rpc_client_poller(void *arg) if (rc == 0) { /* No response yet */ - return -1; + return SPDK_POLLER_BUSY; } else if (rc < 0) { app_json_config_load_done(ctx, rc); - return -1; + return SPDK_POLLER_BUSY; } resp = spdk_jsonrpc_client_get_response(ctx->client_conn); assert(resp); if (resp->error) { - SPDK_ERRLOG("error response: %.*s", (int)resp->error->len, (char *)resp->error->start); + struct json_write_buf buf = {}; + struct spdk_json_write_ctx *w = spdk_json_write_begin(json_write_stdout, + &buf, SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); + + if (w == NULL) { + SPDK_ERRLOG("error response: (?)\n"); + } else { + spdk_json_write_val(w, resp->error); + spdk_json_write_end(w); + SPDK_ERRLOG("error response: \n%s\n", buf.data); + } } if (resp->error && ctx->stop_on_error) { @@ -206,7 +235,7 @@ rpc_client_poller(void *arg) } - return -1; + return SPDK_POLLER_BUSY; } static int @@ -226,9 +255,11 @@ rpc_client_connect_poller(void *_ctx) if (rc) { app_json_config_load_done(ctx, rc); } + + return SPDK_POLLER_IDLE; } - return -1; + return SPDK_POLLER_BUSY; } static int @@ -322,7 +353,7 @@ app_json_config_load_subsystem_config_entry(void *_ctx) struct spdk_json_write_ctx *w; struct config_entry cfg = {}; struct spdk_json_val *params_end; - size_t params_len; + size_t params_len = 0; int rc; if (ctx->config_it == NULL) { @@ -336,10 +367,7 @@ app_json_config_load_subsystem_config_entry(void *_ctx) if (spdk_json_decode_object(ctx->config_it, jsonrpc_cmd_decoders, SPDK_COUNTOF(jsonrpc_cmd_decoders), &cfg)) { - params_end = spdk_json_next(ctx->config_it); - assert(params_end != NULL); - params_len = params_end->start - ctx->config->start + 1; - 
SPDK_ERRLOG("Failed to decode config entry: %.*s!\n", (int)params_len, (char *)ctx->config_it); + SPDK_ERRLOG("Failed to decode config entry\n"); app_json_config_load_done(ctx, -EINVAL); goto out; } @@ -353,14 +381,17 @@ app_json_config_load_subsystem_config_entry(void *_ctx) goto out; } - /* Get _END by skipping params and going back by one element. */ - params_end = cfg.params + spdk_json_val_len(cfg.params) - 1; + SPDK_DEBUG_APP_CFG("\tmethod: %s\n", cfg.method); - /* Need to add one character to include '}' */ - params_len = params_end->start - cfg.params->start + 1; + if (cfg.params) { + /* Get _END by skipping params and going back by one element. */ + params_end = cfg.params + spdk_json_val_len(cfg.params) - 1; - SPDK_DEBUG_APP_CFG("\tmethod: %s\n", cfg.method); - SPDK_DEBUG_APP_CFG("\tparams: %.*s\n", (int)params_len, (char *)cfg.params->start); + /* Need to add one character to include '}' */ + params_len = params_end->start - cfg.params->start + 1; + + SPDK_DEBUG_APP_CFG("\tparams: %.*s\n", (int)params_len, (char *)cfg.params->start); + } rpc_request = spdk_jsonrpc_client_create_request(); if (!rpc_request) { @@ -377,10 +408,13 @@ app_json_config_load_subsystem_config_entry(void *_ctx) spdk_json_write_named_string(w, "method", cfg.method); - /* No need to parse "params". Just dump the whole content of "params" - * directly into the request and let the remote side verify it. */ - spdk_json_write_name(w, "params"); - spdk_json_write_val_raw(w, cfg.params->start, params_len); + if (cfg.params) { + /* No need to parse "params". Just dump the whole content of "params" + * directly into the request and let the remote side verify it. 
*/ + spdk_json_write_name(w, "params"); + spdk_json_write_val_raw(w, cfg.params->start, params_len); + } + spdk_jsonrpc_end_request(rpc_request, w); rc = client_send_request(ctx, rpc_request, app_json_config_load_subsystem_config_entry_next); diff --git a/lib/event/reactor.c b/lib/event/reactor.c index cda4a32b1e7..9df8a7d9eca 100644 --- a/lib/event/reactor.c +++ b/lib/event/reactor.c @@ -494,20 +494,11 @@ static void _schedule_thread(void *arg1, void *arg2) { struct spdk_lw_thread *lw_thread = arg1; - struct spdk_thread *thread; - struct spdk_cpuset *cpumask; struct spdk_reactor *reactor; uint32_t current_core; current_core = spdk_env_get_current_core(); - thread = spdk_thread_get_from_ctx(lw_thread); - cpumask = spdk_thread_get_cpumask(thread); - if (!spdk_cpuset_get_cpu(cpumask, current_core)) { - SPDK_ERRLOG("Thread was scheduled to the wrong core %d\n", current_core); - assert(false); - } - reactor = spdk_reactor_get(current_core); assert(reactor != NULL); diff --git a/lib/event/rpc.c b/lib/event/rpc.c index dc3523a29d4..a42d5ebeb45 100644 --- a/lib/event/rpc.c +++ b/lib/event/rpc.c @@ -49,7 +49,7 @@ static int rpc_subsystem_poll(void *arg) { spdk_rpc_accept(); - return -1; + return SPDK_POLLER_BUSY; } void diff --git a/lib/ftl/ftl_core.c b/lib/ftl/ftl_core.c index 69721f01af3..a356486d0e3 100644 --- a/lib/ftl/ftl_core.c +++ b/lib/ftl/ftl_core.c @@ -146,8 +146,6 @@ ftl_remove_wptr(struct ftl_wptr *wptr) ftl_wptr_free(wptr); } -static void ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry); - static struct ftl_wbuf_entry * ftl_acquire_wbuf_entry(struct ftl_io_channel *io_channel, int io_flags) { @@ -910,7 +908,7 @@ ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) return true; } -static void +void ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) { pthread_spin_lock(&entry->lock); @@ -1944,15 +1942,15 @@ ftl_wptr_process_writes(struct ftl_wptr *wptr) return 0; } -static int +static 
bool ftl_process_writes(struct spdk_ftl_dev *dev) { struct ftl_wptr *wptr, *twptr; - size_t num_active = 0; + size_t num_active = 0, num_writes = 0; enum ftl_band_state state; LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) { - ftl_wptr_process_writes(wptr); + num_writes += ftl_wptr_process_writes(wptr); state = wptr->band->state; if (state != FTL_BAND_STATE_FULL && @@ -1966,7 +1964,7 @@ ftl_process_writes(struct spdk_ftl_dev *dev) ftl_add_wptr(dev); } - return 0; + return num_writes != 0; } static void @@ -2106,7 +2104,7 @@ ftl_select_defrag_band(struct spdk_ftl_dev *dev) return mband; } -static void +static bool ftl_process_relocs(struct spdk_ftl_dev *dev) { struct ftl_band *band; @@ -2119,7 +2117,7 @@ ftl_process_relocs(struct spdk_ftl_dev *dev) } } - ftl_reloc(dev->reloc); + return ftl_reloc(dev->reloc); } int @@ -2412,7 +2410,7 @@ ftl_io_channel_poll(void *arg) TAILQ_HEAD(, ftl_io) retry_queue; if (TAILQ_EMPTY(&ch->write_cmpl_queue) && TAILQ_EMPTY(&ch->retry_queue)) { - return 0; + return SPDK_POLLER_IDLE; } while (!TAILQ_EMPTY(&ch->write_cmpl_queue)) { @@ -2438,25 +2436,25 @@ ftl_io_channel_poll(void *arg) } } - return 1; + return SPDK_POLLER_BUSY; } int ftl_task_core(void *ctx) { struct spdk_ftl_dev *dev = ctx; + bool busy; if (dev->halt) { if (ftl_shutdown_complete(dev)) { spdk_poller_unregister(&dev->core_poller); - return 0; + return SPDK_POLLER_IDLE; } } - ftl_process_writes(dev); - ftl_process_relocs(dev); + busy = ftl_process_writes(dev) || ftl_process_relocs(dev); - return 0; + return busy ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; } SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE) diff --git a/lib/ftl/ftl_core.h b/lib/ftl/ftl_core.h index 454dc78dc83..b782ba7310f 100644 --- a/lib/ftl/ftl_core.h +++ b/lib/ftl/ftl_core.h @@ -298,6 +298,7 @@ int ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb void *cb_arg); void ftl_get_media_events(struct spdk_ftl_dev *dev); int ftl_io_channel_poll(void *arg); +void ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry); struct spdk_io_channel *ftl_get_io_channel(const struct spdk_ftl_dev *dev); struct ftl_io_channel *ftl_io_channel_get_ctx(struct spdk_io_channel *ioch); diff --git a/lib/ftl/ftl_init.c b/lib/ftl/ftl_init.c index 1f5d29ec4b1..15a8c21c970 100644 --- a/lib/ftl/ftl_init.c +++ b/lib/ftl/ftl_init.c @@ -133,13 +133,18 @@ static int ftl_band_init_md(struct ftl_band *band) { struct ftl_lba_map *lba_map = &band->lba_map; + int rc; lba_map->vld = spdk_bit_array_create(ftl_get_num_blocks_in_band(band->dev)); if (!lba_map->vld) { return -ENOMEM; } - pthread_spin_init(&lba_map->lock, PTHREAD_PROCESS_PRIVATE); + rc = pthread_spin_init(&lba_map->lock, PTHREAD_PROCESS_PRIVATE); + if (rc) { + spdk_bit_array_free(&lba_map->vld); + return rc; + } ftl_band_md_clear(band); return 0; } @@ -1207,6 +1212,7 @@ _ftl_io_channel_destroy_cb(void *ctx) { struct ftl_io_channel *ioch = ctx; struct spdk_ftl_dev *dev = ioch->dev; + uint32_t i; /* Do not destroy the channel if some of its entries are still in use */ if (spdk_ring_count(ioch->free_queue) != ioch->num_entries) { @@ -1214,6 +1220,11 @@ _ftl_io_channel_destroy_cb(void *ctx) return; } + /* Evict all valid entries from cache */ + for (i = 0; i < ioch->num_entries; ++i) { + ftl_evict_cache_entry(dev, &ioch->wbuf_entries[i]); + } + spdk_poller_unregister(&ioch->poller); spdk_put_io_channel(ioch->base_ioch); @@ -1626,7 +1637,7 @@ ftl_halt_poller(void *ctx) } } - return 0; + return SPDK_POLLER_BUSY; } static void diff --git 
a/lib/ftl/ftl_reloc.c b/lib/ftl/ftl_reloc.c index e59bf4d81c3..66b7bd64dab 100644 --- a/lib/ftl/ftl_reloc.c +++ b/lib/ftl/ftl_reloc.c @@ -747,20 +747,20 @@ ftl_reloc_resume(struct ftl_reloc *reloc) reloc->halt = false; } -void +bool ftl_reloc(struct ftl_reloc *reloc) { struct ftl_band_reloc *breloc, *tbreloc; if (ftl_reloc_is_halted(reloc)) { - return; + return false; } /* Process first band from priority queue and return */ breloc = TAILQ_FIRST(&reloc->prio_queue); if (breloc) { ftl_process_reloc(breloc); - return; + return true; } TAILQ_FOREACH_SAFE(breloc, &reloc->pending_queue, entry, tbreloc) { @@ -784,6 +784,8 @@ ftl_reloc(struct ftl_reloc *reloc) assert(breloc->state == FTL_BAND_RELOC_STATE_ACTIVE); ftl_process_reloc(breloc); } + + return reloc->num_active != 0; } void diff --git a/lib/ftl/ftl_reloc.h b/lib/ftl/ftl_reloc.h index 21f49a47d2d..96e2ea736e8 100644 --- a/lib/ftl/ftl_reloc.h +++ b/lib/ftl/ftl_reloc.h @@ -44,7 +44,7 @@ struct ftl_reloc *ftl_reloc_init(struct spdk_ftl_dev *dev); void ftl_reloc_free(struct ftl_reloc *reloc); void ftl_reloc_add(struct ftl_reloc *reloc, struct ftl_band *band, size_t offset, size_t num_blocks, int prio, bool is_defrag); -void ftl_reloc(struct ftl_reloc *reloc); +bool ftl_reloc(struct ftl_reloc *reloc); void ftl_reloc_halt(struct ftl_reloc *reloc); void ftl_reloc_resume(struct ftl_reloc *reloc); bool ftl_reloc_is_halted(const struct ftl_reloc *reloc); diff --git a/lib/idxd/idxd.c b/lib/idxd/idxd.c index 33e83d3ef16..992d9621169 100644 --- a/lib/idxd/idxd.c +++ b/lib/idxd/idxd.c @@ -104,6 +104,8 @@ struct spdk_idxd_io_channel * spdk_idxd_get_channel(struct spdk_idxd_device *idxd) { struct spdk_idxd_io_channel *chan; + struct idxd_batch *batch; + int i; chan = calloc(1, sizeof(struct spdk_idxd_io_channel)); if (chan == NULL) { @@ -112,6 +114,22 @@ spdk_idxd_get_channel(struct spdk_idxd_device *idxd) } chan->idxd = idxd; + TAILQ_INIT(&chan->batches); + + TAILQ_INIT(&chan->batch_pool); + for (i = 0 ; i < NUM_BATCHES ; i++) { 
+ batch = calloc(1, sizeof(struct idxd_batch)); + if (batch == NULL) { + SPDK_ERRLOG("Failed to allocate batch\n"); + while ((batch = TAILQ_FIRST(&chan->batch_pool))) { + TAILQ_REMOVE(&chan->batch_pool, batch, link); + free(batch); + } + return NULL; + } + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + } + return chan; } @@ -125,7 +143,9 @@ int spdk_idxd_configure_chan(struct spdk_idxd_io_channel *chan) { uint32_t num_ring_slots; + int rc; + /* Round robin the WQ selection for the chan on this IDXD device. */ chan->idxd->wq_id++; if (chan->idxd->wq_id == g_dev_cfg->total_wqs) { chan->idxd->wq_id = 0; @@ -148,13 +168,13 @@ spdk_idxd_configure_chan(struct spdk_idxd_io_channel *chan) /* Store the original size of the ring. */ chan->ring_ctrl.ring_size = num_ring_slots; - chan->ring_ctrl.data_desc = spdk_zmalloc(num_ring_slots * sizeof(struct idxd_hw_desc), - 0x40, NULL, - SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); - if (chan->ring_ctrl.data_desc == NULL) { + chan->ring_ctrl.desc = spdk_zmalloc(num_ring_slots * sizeof(struct idxd_hw_desc), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.desc == NULL) { SPDK_ERRLOG("Failed to allocate descriptor memory\n"); - spdk_bit_array_free(&chan->ring_ctrl.ring_slots); - return -ENOMEM; + rc = -ENOMEM; + goto err_desc; } chan->ring_ctrl.completions = spdk_zmalloc(num_ring_slots * sizeof(struct idxd_comp), @@ -162,13 +182,79 @@ spdk_idxd_configure_chan(struct spdk_idxd_io_channel *chan) SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); if (chan->ring_ctrl.completions == NULL) { SPDK_ERRLOG("Failed to allocate completion memory\n"); - spdk_bit_array_free(&chan->ring_ctrl.ring_slots); - spdk_free(chan->ring_ctrl.data_desc); - return -ENOMEM; + rc = -ENOMEM; + goto err_comp; + } + + chan->ring_ctrl.user_desc = spdk_zmalloc(TOTAL_USER_DESC * sizeof(struct idxd_hw_desc), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.user_desc == NULL) { + SPDK_ERRLOG("Failed to allocate batch 
descriptor memory\n"); + rc = -ENOMEM; + goto err_user_desc; + } + + /* Each slot on the ring reserves DESC_PER_BATCH elemnts in user_desc. */ + chan->ring_ctrl.user_ring_slots = spdk_bit_array_create(NUM_BATCHES); + if (chan->ring_ctrl.user_ring_slots == NULL) { + SPDK_ERRLOG("Failed to allocate bit array for user ring\n"); + rc = -ENOMEM; + goto err_user_ring; + } + + chan->ring_ctrl.user_completions = spdk_zmalloc(TOTAL_USER_DESC * sizeof(struct idxd_comp), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.user_completions == NULL) { + SPDK_ERRLOG("Failed to allocate user completion memory\n"); + rc = -ENOMEM; + goto err_user_comp; } chan->ring_ctrl.portal = (char *)chan->idxd->portals + chan->idxd->wq_id * PORTAL_SIZE; + return 0; + +err_user_comp: + spdk_bit_array_free(&chan->ring_ctrl.user_ring_slots); +err_user_ring: + spdk_free(chan->ring_ctrl.user_desc); +err_user_desc: + spdk_free(chan->ring_ctrl.completions); +err_comp: + spdk_free(chan->ring_ctrl.desc); +err_desc: + spdk_bit_array_free(&chan->ring_ctrl.ring_slots); + + return rc; +} + +/* Used for control commands, not for descriptor submission. 
*/ +static int +idxd_wait_cmd(struct spdk_idxd_device *idxd, int _timeout) +{ + uint32_t timeout = _timeout; + union idxd_cmdsts_reg cmd_status = {}; + + cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET); + while (cmd_status.active && --timeout) { + usleep(1); + cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET); + } + + /* Check for timeout */ + if (timeout == 0 && cmd_status.active) { + SPDK_ERRLOG("Command timeout, waited %u\n", _timeout); + return -EBUSY; + } + + /* Check for error */ + if (cmd_status.err) { + SPDK_ERRLOG("Command status reg reports error 0x%x\n", cmd_status.err); + return -EINVAL; + } + return 0; } @@ -178,10 +264,6 @@ _idxd_drain(struct spdk_idxd_io_channel *chan) uint32_t index; int set = 0; - /* - * TODO this is a temp solution to drain until getting the drain cmd to work, this - * provides equivalent functionality but just doesn't use the device to do it. - */ do { spdk_idxd_process_events(chan); set = 0; @@ -196,6 +278,7 @@ spdk_idxd_reconfigure_chan(struct spdk_idxd_io_channel *chan, uint32_t num_chann { uint32_t num_ring_slots; int rc; + struct idxd_batch *batch; _idxd_drain(chan); @@ -203,8 +286,15 @@ spdk_idxd_reconfigure_chan(struct spdk_idxd_io_channel *chan, uint32_t num_chann if (num_channels == 0) { spdk_free(chan->ring_ctrl.completions); - spdk_free(chan->ring_ctrl.data_desc); + spdk_free(chan->ring_ctrl.desc); spdk_bit_array_free(&chan->ring_ctrl.ring_slots); + spdk_free(chan->ring_ctrl.user_completions); + spdk_free(chan->ring_ctrl.user_desc); + spdk_bit_array_free(&chan->ring_ctrl.user_ring_slots); + while ((batch = TAILQ_FIRST(&chan->batch_pool))) { + TAILQ_REMOVE(&chan->batch_pool, batch, link); + free(batch); + } return 0; } @@ -219,6 +309,12 @@ spdk_idxd_reconfigure_chan(struct spdk_idxd_io_channel *chan, uint32_t num_chann chan->ring_ctrl.max_ring_slots = num_ring_slots; + /* + * Note: The batch descriptor ring does not change with the + * number of channels as descriptors on this ring do not + * "count" for 
flow control. + */ + return rc; } @@ -286,34 +382,6 @@ idxd_map_pci_bars(struct spdk_idxd_device *idxd) return 0; } -/* Used for control commands, not for descriptor submission. */ -static int -idxd_wait_cmd(struct spdk_idxd_device *idxd, int _timeout) -{ - uint32_t timeout = _timeout; - union idxd_cmdsts_reg cmd_status = {}; - - cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET); - while (cmd_status.active && --timeout) { - usleep(1); - cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET); - } - - /* Check for timeout */ - if (timeout == 0 && cmd_status.active) { - SPDK_ERRLOG("Command timeout, waited %u\n", _timeout); - return -EBUSY; - } - - /* Check for error */ - if (cmd_status.err) { - SPDK_ERRLOG("Command status reg reports error 0x%x\n", cmd_status.err); - return -EINVAL; - } - - return 0; -} - static int idxd_reset_dev(struct spdk_idxd_device *idxd) { @@ -639,8 +707,8 @@ spdk_idxd_detach(struct spdk_idxd_device *idxd) } static struct idxd_hw_desc * -_idxd_prep_command(struct spdk_idxd_io_channel *chan, - spdk_idxd_req_cb cb_fn, void *cb_arg) +_idxd_prep_command(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn, + void *cb_arg, struct idxd_batch *batch) { uint32_t index; struct idxd_hw_desc *desc; @@ -654,26 +722,29 @@ _idxd_prep_command(struct spdk_idxd_io_channel *chan, spdk_bit_array_set(chan->ring_ctrl.ring_slots, index); - desc = &chan->ring_ctrl.data_desc[index]; + desc = &chan->ring_ctrl.desc[index]; comp = &chan->ring_ctrl.completions[index]; desc->flags = IDXD_FLAG_COMPLETION_ADDR_VALID | IDXD_FLAG_REQUEST_COMPLETION; desc->completion_addr = (uintptr_t)&comp->hw; comp->cb_arg = cb_arg; comp->cb_fn = cb_fn; + if (batch) { + comp->batch = batch; + batch->batch_desc_index = index; + } return desc; } int spdk_idxd_submit_copy(struct spdk_idxd_io_channel *chan, void *dst, const void *src, - uint64_t nbytes, - spdk_idxd_req_cb cb_fn, void *cb_arg) + uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) { struct idxd_hw_desc *desc; /* 
Common prep. */ - desc = _idxd_prep_command(chan, cb_fn, cb_arg); + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); if (desc == NULL) { return -EBUSY; } @@ -703,7 +774,7 @@ spdk_idxd_submit_dualcast(struct spdk_idxd_io_channel *chan, void *dst1, void *d } /* Common prep. */ - desc = _idxd_prep_command(chan, cb_fn, cb_arg); + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); if (desc == NULL) { return -EBUSY; } @@ -729,7 +800,7 @@ spdk_idxd_submit_compare(struct spdk_idxd_io_channel *chan, void *src1, const vo struct idxd_hw_desc *desc; /* Common prep. */ - desc = _idxd_prep_command(chan, cb_fn, cb_arg); + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); if (desc == NULL) { return -EBUSY; } @@ -748,13 +819,12 @@ spdk_idxd_submit_compare(struct spdk_idxd_io_channel *chan, void *src1, const vo int spdk_idxd_submit_fill(struct spdk_idxd_io_channel *chan, void *dst, uint64_t fill_pattern, - uint64_t nbytes, - spdk_idxd_req_cb cb_fn, void *cb_arg) + uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) { struct idxd_hw_desc *desc; /* Common prep. */ - desc = _idxd_prep_command(chan, cb_fn, cb_arg); + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); if (desc == NULL) { return -EBUSY; } @@ -779,7 +849,7 @@ spdk_idxd_submit_crc32c(struct spdk_idxd_io_channel *chan, uint32_t *dst, void * struct idxd_hw_desc *desc; /* Common prep. 
*/ - desc = _idxd_prep_command(chan, cb_fn, cb_arg); + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); if (desc == NULL) { return -EBUSY; } @@ -798,6 +868,270 @@ spdk_idxd_submit_crc32c(struct spdk_idxd_io_channel *chan, uint32_t *dst, void * return 0; } +uint32_t +spdk_idxd_batch_get_max(void) +{ + return DESC_PER_BATCH; /* TODO maybe add startup RPC to set this */ +} + +struct idxd_batch * +spdk_idxd_batch_create(struct spdk_idxd_io_channel *chan) +{ + struct idxd_batch *batch = NULL; + + if (!TAILQ_EMPTY(&chan->batch_pool)) { + batch = TAILQ_FIRST(&chan->batch_pool); + TAILQ_REMOVE(&chan->batch_pool, batch, link); + } else { + /* The application needs to handle this. */ + return NULL; + } + + batch->batch_num = spdk_bit_array_find_first_clear(chan->ring_ctrl.user_ring_slots, 0); + if (batch->batch_num == UINT32_MAX) { + /* ran out of ring slots, the application needs to handle this. */ + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + return NULL; + } + + spdk_bit_array_set(chan->ring_ctrl.user_ring_slots, batch->batch_num); + + /* + * Find the first descriptor address for the given batch. The + * descriptor ring used for user desctipors is allocated in + * units of DESC_PER_BATCH. The actual index is in units of + * one descriptor. 
+ */ + batch->start_index = batch->cur_index = batch->batch_num * DESC_PER_BATCH; + + TAILQ_INSERT_TAIL(&chan->batches, batch, link); + SPDK_DEBUGLOG(SPDK_LOG_IDXD, "New batch %p num %u\n", batch, batch->batch_num); + + return batch; +} + +static bool +_does_batch_exist(struct idxd_batch *batch, struct spdk_idxd_io_channel *chan) +{ + bool found = false; + struct idxd_batch *cur_batch; + + TAILQ_FOREACH(cur_batch, &chan->batches, link) { + if (cur_batch == batch) { + found = true; + break; + } + } + + return found; +} + +int +spdk_idxd_batch_cancel(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch) +{ + if (_does_batch_exist(batch, chan) == false) { + SPDK_ERRLOG("Attempt to cancel a batch that doesn't exist\n."); + return -EINVAL; + } + + if (batch->remaining > 0) { + SPDK_ERRLOG("Cannot cancel batch, already submitted to HW\n."); + return -EINVAL; + } + + TAILQ_REMOVE(&chan->batches, batch, link); + spdk_bit_array_clear(chan->ring_ctrl.user_ring_slots, batch->batch_num); + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + + return 0; +} + +int +spdk_idxd_batch_submit(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + if (_does_batch_exist(batch, chan) == false) { + SPDK_ERRLOG("Attempt to submit a batch that doesn't exist\n."); + return -EINVAL; + } + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_IDXD, "Can't submit batch %p busy batch num %u\n", batch, batch->batch_num); + return -EBUSY; + } + + /* Command specific. 
*/ + desc->opcode = IDXD_OPCODE_BATCH; + desc->desc_list_addr = (uintptr_t)&chan->ring_ctrl.user_desc[batch->start_index]; + desc->desc_count = batch->cur_index - batch->start_index; + assert(desc->desc_count <= DESC_PER_BATCH); + + if (desc->desc_count < MIN_USER_DESC_COUNT) { + SPDK_ERRLOG("Attempt to submit a batch without at least %u operations.\n", + MIN_USER_DESC_COUNT); + return -EINVAL; + } + + /* Total completions for the batch = num desc plus 1 for the batch desc itself. */ + batch->remaining = desc->desc_count + 1; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +static struct idxd_hw_desc * +_idxd_prep_batch_cmd(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn, + void *cb_arg, struct idxd_batch *batch) +{ + struct idxd_hw_desc *desc; + struct idxd_comp *comp; + + if (_does_batch_exist(batch, chan) == false) { + SPDK_ERRLOG("Attempt to add to a batch that doesn't exist\n."); + return NULL; + } + + if ((batch->cur_index - batch->start_index) == DESC_PER_BATCH) { + SPDK_ERRLOG("Attempt to add to a batch that is already full\n."); + return NULL; + } + + desc = &chan->ring_ctrl.user_desc[batch->cur_index]; + comp = &chan->ring_ctrl.user_completions[batch->cur_index]; + SPDK_DEBUGLOG(SPDK_LOG_IDXD, "Prep batch %p index %u\n", batch, batch->cur_index); + + batch->cur_index++; + assert(batch->cur_index > batch->start_index); + + desc->flags = IDXD_FLAG_COMPLETION_ADDR_VALID | IDXD_FLAG_REQUEST_COMPLETION; + desc->completion_addr = (uintptr_t)&comp->hw; + comp->cb_arg = cb_arg; + comp->cb_fn = cb_fn; + comp->batch = batch; + + return desc; +} + +int +spdk_idxd_batch_prep_copy(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. 
*/ + desc->opcode = IDXD_OPCODE_MEMMOVE; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_fill(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst, uint64_t fill_pattern, uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_MEMFILL; + desc->pattern = fill_pattern; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_dualcast(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst1, void *dst2, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + desc->opcode = IDXD_OPCODE_DUALCAST; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst1; + desc->dest2 = (uintptr_t)dst2; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_crc32c(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. 
*/ + desc->opcode = IDXD_OPCODE_CRC32C_GEN; + desc->dst_addr = (uintptr_t)dst; + desc->src_addr = (uintptr_t)src; + desc->flags &= IDXD_CLEAR_CRC_FLAGS; + desc->crc32c.seed = seed; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_compare(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *src1, void *src2, uint64_t nbytes, spdk_idxd_req_cb cb_fn, + void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_COMPARE; + desc->src_addr = (uintptr_t)src1; + desc->src2_addr = (uintptr_t)src2; + desc->xfer_size = nbytes; + + return 0; +} + static void _dump_error_reg(struct spdk_idxd_io_channel *chan) { @@ -817,6 +1151,79 @@ _dump_error_reg(struct spdk_idxd_io_channel *chan) SPDK_NOTICELOG("SW Error Operation: %u\n", (uint8_t)(sw_error_0 >> 32)); } +static void +_free_batch(struct idxd_batch *batch, struct spdk_idxd_io_channel *chan, + struct idxd_comp *comp) +{ + TAILQ_REMOVE(&chan->batches, batch, link); + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + comp->batch = NULL; + spdk_bit_array_clear(chan->ring_ctrl.user_ring_slots, batch->batch_num); + spdk_bit_array_clear(chan->ring_ctrl.ring_slots, batch->batch_desc_index); +} + +static void +_spdk_idxd_process_batch_events(struct spdk_idxd_io_channel *chan) +{ + uint16_t index; + struct idxd_comp *comp; + uint64_t sw_error_0; + int status = 0; + struct idxd_batch *batch; + + /* + * We don't check the bit array for user completions as there's only + * one bit per per batch. 
+ */ + for (index = 0; index < TOTAL_USER_DESC; index++) { + comp = &chan->ring_ctrl.user_completions[index]; + if (comp->hw.status == 1) { + struct idxd_hw_desc *desc; + + sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET); + if (sw_error_0 & 0x1) { + _dump_error_reg(chan); + status = -EINVAL; + } + + desc = &chan->ring_ctrl.user_desc[index]; + switch (desc->opcode) { + case IDXD_OPCODE_CRC32C_GEN: + *(uint32_t *)desc->dst_addr = comp->hw.crc32c_val; + *(uint32_t *)desc->dst_addr ^= ~0; + break; + case IDXD_OPCODE_COMPARE: + if (status == 0) { + status = comp->hw.result; + } + break; + case IDXD_OPCODE_MEMFILL: + case IDXD_OPCODE_DUALCAST: + case IDXD_OPCODE_MEMMOVE: + break; + default: + assert(false); + break; + } + + /* The hw will complete all user desc first before the batch + * desc (see spec for configuration exceptions) however + * because of the order that we check for comps in the poller + * we may "see" them in a different order than they actually + * completed in. + */ + batch = comp->batch; + assert(batch->remaining > 0); + if (--batch->remaining == 0) { + _free_batch(batch, chan, comp); + } + + comp->cb_fn((void *)comp->cb_arg, status); + comp->hw.status = status = 0; + } + } +} + /* * TODO: Experiment with different methods of reaping completions for performance * once we have real silicon. 
@@ -828,6 +1235,11 @@ spdk_idxd_process_events(struct spdk_idxd_io_channel *chan) struct idxd_comp *comp; uint64_t sw_error_0; int status = 0; + struct idxd_batch *batch; + + if (!TAILQ_EMPTY(&chan->batches)) { + _spdk_idxd_process_batch_events(chan); + } for (index = 0; index < chan->ring_ctrl.max_ring_slots; index++) { if (spdk_bit_array_get(chan->ring_ctrl.ring_slots, index)) { @@ -841,8 +1253,21 @@ spdk_idxd_process_events(struct spdk_idxd_io_channel *chan) status = -EINVAL; } - desc = &chan->ring_ctrl.data_desc[index]; + desc = &chan->ring_ctrl.desc[index]; switch (desc->opcode) { + case IDXD_OPCODE_BATCH: + /* The hw will complete all user desc first before the batch + * desc (see spec for configuration exceptions) however + * because of the order that we check for comps in the poller + * we may "see" them in a different order than they actually + * completed in. + */ + batch = comp->batch; + assert(batch->remaining > 0); + if (--batch->remaining == 0) { + _free_batch(batch, chan, comp); + } + break; case IDXD_OPCODE_CRC32C_GEN: *(uint32_t *)desc->dst_addr = comp->hw.crc32c_val; *(uint32_t *)desc->dst_addr ^= ~0; @@ -856,7 +1281,9 @@ spdk_idxd_process_events(struct spdk_idxd_io_channel *chan) comp->cb_fn(comp->cb_arg, status); comp->hw.status = status = 0; - spdk_bit_array_clear(chan->ring_ctrl.ring_slots, index); + if (desc->opcode != IDXD_OPCODE_BATCH) { + spdk_bit_array_clear(chan->ring_ctrl.ring_slots, index); + } } } } diff --git a/lib/idxd/idxd.h b/lib/idxd/idxd.h index 4664b5c56e9..09d021152d7 100644 --- a/lib/idxd/idxd.h +++ b/lib/idxd/idxd.h @@ -57,6 +57,7 @@ static inline void movdir64b(void *dst, const void *src) } #define IDXD_REGISTER_TIMEOUT_US 50 +#define IDXD_DRAIN_TIMEOUT_US 500000 /* TODO: make some of these RPC selectable */ #define WQ_MODE_DEDICATED 1 @@ -66,6 +67,19 @@ static inline void movdir64b(void *dst, const void *src) #define WQ_PRIORITY_1 1 #define IDXD_MAX_QUEUES 64 +#define TOTAL_USER_DESC (1 << LOG2_WQ_MAX_BATCH) +#define 
DESC_PER_BATCH 16 /* TODO maybe make this a startup RPC */ +#define NUM_BATCHES (TOTAL_USER_DESC / DESC_PER_BATCH) +#define MIN_USER_DESC_COUNT 2 + +struct idxd_batch { + uint32_t batch_desc_index; + uint32_t batch_num; + uint32_t cur_index; + uint32_t start_index; + uint32_t remaining; + TAILQ_ENTRY(idxd_batch) link; +}; struct device_config { uint8_t config_num; @@ -83,24 +97,33 @@ struct idxd_ring_control { /* * Rings for this channel, one for descriptors and one - * for completions, share the same index. Future will - * include a separate ring for batch descriptors once - * the batch interface is completed. + * for completions, share the same index. Batch descriptors + * are managed independently from data descriptors. */ - struct idxd_hw_desc *data_desc; + struct idxd_hw_desc *desc; struct idxd_comp *completions; + struct idxd_hw_desc *user_desc; + struct idxd_comp *user_completions; /* * We use one bit array to track ring slots for both - * data_desc and completions. + * desc and completions. */ struct spdk_bit_array *ring_slots; uint32_t max_ring_slots; + + /* + * We use a separate bit array to track ring slots for + * descriptors submitted via the user in a batch. 
+ */ + struct spdk_bit_array *user_ring_slots; }; struct spdk_idxd_io_channel { struct spdk_idxd_device *idxd; struct idxd_ring_control ring_ctrl; + TAILQ_HEAD(, idxd_batch) batch_pool; /* free batches */ + TAILQ_HEAD(, idxd_batch) batches; /* in use batches */ }; struct pci_dev_id { @@ -130,7 +153,7 @@ struct idxd_comp { struct idxd_hw_comp_record hw; void *cb_arg; spdk_idxd_req_cb cb_fn; - uint64_t pad1; + struct idxd_batch *batch; uint64_t pad2; } __attribute__((packed)); SPDK_STATIC_ASSERT(sizeof(struct idxd_comp) == 64, "size mismatch"); diff --git a/lib/idxd/spdk_idxd.map b/lib/idxd/spdk_idxd.map index 4abce91ec05..4bffdf209c3 100644 --- a/lib/idxd/spdk_idxd.map +++ b/lib/idxd/spdk_idxd.map @@ -6,6 +6,15 @@ spdk_idxd_reconfigure_chan; spdk_idxd_probe; spdk_idxd_detach; + spdk_idxd_batch_prep_copy; + spdk_idxd_batch_prep_dualcast; + spdk_idxd_batch_prep_fill; + spdk_idxd_batch_prep_crc32c; + spdk_idxd_batch_prep_compare; + spdk_idxd_batch_submit; + spdk_idxd_batch_create; + spdk_idxd_batch_cancel; + spdk_idxd_batch_get_max; spdk_idxd_set_config; spdk_idxd_submit_compare; spdk_idxd_submit_crc32c; diff --git a/lib/ioat/Makefile b/lib/ioat/Makefile index a0da25b86b6..4cada5685e5 100644 --- a/lib/ioat/Makefile +++ b/lib/ioat/Makefile @@ -34,7 +34,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk -SO_VER := 2 +SO_VER := 3 SO_MINOR := 0 C_SRCS = ioat.c diff --git a/lib/ioat/ioat.c b/lib/ioat/ioat.c index ccc16ce11a3..516fa545cfd 100644 --- a/lib/ioat/ioat.c +++ b/lib/ioat/ioat.c @@ -318,7 +318,7 @@ static int ioat_process_channel_events(struct spdk_ioat_chan *ioat) { struct ioat_descriptor *desc; - uint64_t status, completed_descriptor, hw_desc_phys_addr; + uint64_t status, completed_descriptor, hw_desc_phys_addr, events_count = 0; uint32_t tail; if (ioat->head == ioat->tail) { @@ -347,10 +347,12 @@ ioat_process_channel_events(struct spdk_ioat_chan *ioat) hw_desc_phys_addr = desc->phys_addr; ioat->tail++; + events_count++; } while (hw_desc_phys_addr != completed_descriptor); ioat->last_seen = hw_desc_phys_addr; - return 0; + + return events_count; } static void @@ -372,6 +374,12 @@ ioat_channel_destruct(struct spdk_ioat_chan *ioat) } } +uint32_t +spdk_ioat_get_max_descriptors(struct spdk_ioat_chan *ioat) +{ + return 1 << ioat->ring_size_order; +} + static int ioat_channel_start(struct spdk_ioat_chan *ioat) { diff --git a/lib/ioat/spdk_ioat.map b/lib/ioat/spdk_ioat.map index a74768c9ee3..f467da8172b 100644 --- a/lib/ioat/spdk_ioat.map +++ b/lib/ioat/spdk_ioat.map @@ -11,6 +11,7 @@ spdk_ioat_flush; spdk_ioat_process_events; spdk_ioat_get_dma_capabilities; + spdk_ioat_get_max_descriptors; local: *; }; diff --git a/lib/iscsi/conn.c b/lib/iscsi/conn.c index 4d9bc4cd1de..5038afb57e9 100644 --- a/lib/iscsi/conn.c +++ b/lib/iscsi/conn.c @@ -66,6 +66,9 @@ struct spdk_iscsi_conn *g_conns_array = MAP_FAILED; static int g_conns_array_fd = -1; static char g_shm_name[64]; +static TAILQ_HEAD(, spdk_iscsi_conn) g_free_conns = TAILQ_HEAD_INITIALIZER(g_free_conns); +static TAILQ_HEAD(, spdk_iscsi_conn) g_active_conns = TAILQ_HEAD_INITIALIZER(g_active_conns); + static pthread_mutex_t g_conns_mutex = PTHREAD_MUTEX_INITIALIZER; static struct spdk_poller *g_shutdown_timer = NULL; @@ -77,39 +80,40 @@ static struct spdk_iscsi_conn * 
allocate_conn(void) { struct spdk_iscsi_conn *conn; - int i; pthread_mutex_lock(&g_conns_mutex); - for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { - conn = &g_conns_array[i]; - if (!conn->is_valid) { - SPDK_ISCSI_CONNECTION_MEMSET(conn); - conn->is_valid = 1; - pthread_mutex_unlock(&g_conns_mutex); - return conn; - } + conn = TAILQ_FIRST(&g_free_conns); + if (conn != NULL) { + assert(!conn->is_valid); + TAILQ_REMOVE(&g_free_conns, conn, conn_link); + SPDK_ISCSI_CONNECTION_MEMSET(conn); + conn->is_valid = 1; + + TAILQ_INSERT_TAIL(&g_active_conns, conn, conn_link); } pthread_mutex_unlock(&g_conns_mutex); - return NULL; + return conn; } static void -free_conn(struct spdk_iscsi_conn *conn) +_free_conn(struct spdk_iscsi_conn *conn) { + TAILQ_REMOVE(&g_active_conns, conn, conn_link); + memset(conn->portal_host, 0, sizeof(conn->portal_host)); memset(conn->portal_port, 0, sizeof(conn->portal_port)); conn->is_valid = 0; + + TAILQ_INSERT_TAIL(&g_free_conns, conn, conn_link); } -static struct spdk_iscsi_conn * -find_iscsi_connection_by_id(int cid) +static void +free_conn(struct spdk_iscsi_conn *conn) { - if (g_conns_array != MAP_FAILED && g_conns_array[cid].is_valid == 1) { - return &g_conns_array[cid]; - } else { - return NULL; - } + pthread_mutex_lock(&g_conns_mutex); + _free_conn(conn); + pthread_mutex_unlock(&g_conns_mutex); } static void @@ -158,6 +162,7 @@ int initialize_iscsi_conns(void) for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { g_conns_array[i].id = i; + TAILQ_INSERT_TAIL(&g_free_conns, &g_conns_array[i], conn_link); } return 0; @@ -180,7 +185,7 @@ iscsi_poll_group_add_conn(struct spdk_iscsi_poll_group *pg, struct spdk_iscsi_co } conn->is_stopped = false; - STAILQ_INSERT_TAIL(&pg->connections, conn, link); + STAILQ_INSERT_TAIL(&pg->connections, conn, pg_link); } static void @@ -195,7 +200,7 @@ iscsi_poll_group_remove_conn(struct spdk_iscsi_poll_group *pg, struct spdk_iscsi } conn->is_stopped = true; - STAILQ_REMOVE(&pg->connections, conn, spdk_iscsi_conn, link); + 
STAILQ_REMOVE(&pg->connections, conn, spdk_iscsi_conn, pg_link); } static void @@ -224,6 +229,7 @@ iscsi_conn_construct(struct spdk_iscsi_portal *portal, conn->timeout = g_iscsi.timeout * spdk_get_ticks_hz(); /* seconds to TSC */ conn->nopininterval = g_iscsi.nopininterval; conn->nopininterval *= spdk_get_ticks_hz(); /* seconds to TSC */ + conn->last_nopin = spdk_get_ticks(); conn->nop_outstanding = false; conn->data_out_cnt = 0; conn->data_in_cnt = 0; @@ -254,10 +260,6 @@ iscsi_conn_construct(struct spdk_iscsi_portal *portal, conn->sess_param_state_negotiated[i] = false; } - for (i = 0; i < DEFAULT_MAXR2T; i++) { - conn->outstanding_r2t_tasks[i] = NULL; - } - conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY; TAILQ_INIT(&conn->write_pdu_list); @@ -435,7 +437,7 @@ iscsi_conn_free(struct spdk_iscsi_conn *conn) end: SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "cleanup free conn\n"); iscsi_param_free(conn->params); - free_conn(conn); + _free_conn(conn); pthread_mutex_unlock(&g_conns_mutex); } @@ -512,10 +514,10 @@ iscsi_conn_remove_lun(void *ctx) int lun_id = spdk_scsi_lun_get_id(lun); if (!iscsi_conn_check_tasks_for_lun(conn, lun)) { - return -1; + return SPDK_POLLER_BUSY; } iscsi_conn_close_lun(conn, lun_id); - return -1; + return SPDK_POLLER_BUSY; } static void @@ -648,7 +650,7 @@ _iscsi_conn_check_shutdown(void *arg) rc = iscsi_conn_free_tasks(conn); if (rc < 0) { - return 1; + return SPDK_POLLER_BUSY; } spdk_poller_unregister(&conn->shutdown_timer); @@ -656,7 +658,7 @@ _iscsi_conn_check_shutdown(void *arg) iscsi_conn_stop(conn); iscsi_conn_free(conn); - return 1; + return SPDK_POLLER_BUSY; } static void @@ -664,10 +666,9 @@ _iscsi_conn_destruct(struct spdk_iscsi_conn *conn) { int rc; - iscsi_clear_all_transfer_task(conn, NULL, NULL); - iscsi_poll_group_remove_conn(conn->pg, conn); spdk_sock_close(&conn->sock); + iscsi_clear_all_transfer_task(conn, NULL, NULL); spdk_poller_unregister(&conn->logout_request_timer); spdk_poller_unregister(&conn->logout_timer); @@ -688,14 
+689,14 @@ _iscsi_conn_check_pending_tasks(void *arg) if (conn->dev != NULL && spdk_scsi_dev_has_pending_tasks(conn->dev, conn->initiator_port)) { - return 1; + return SPDK_POLLER_BUSY; } spdk_poller_unregister(&conn->shutdown_timer); _iscsi_conn_destruct(conn); - return 1; + return SPDK_POLLER_BUSY; } void @@ -755,18 +756,16 @@ iscsi_get_active_conns(struct spdk_iscsi_tgt_node *target) { struct spdk_iscsi_conn *conn; int num = 0; - int i; + + if (g_conns_array == MAP_FAILED) { + return 0; + } pthread_mutex_lock(&g_conns_mutex); - for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { - conn = find_iscsi_connection_by_id(i); - if (conn == NULL) { - continue; - } - if (target != NULL && conn->target != target) { - continue; + TAILQ_FOREACH(conn, &g_active_conns, conn_link) { + if (target == NULL || conn->target == target) { + num++; } - num++; } pthread_mutex_unlock(&g_conns_mutex); return num; @@ -783,14 +782,14 @@ static int iscsi_conn_check_shutdown(void *arg) { if (iscsi_get_active_conns(NULL) != 0) { - return 1; + return SPDK_POLLER_BUSY; } spdk_poller_unregister(&g_shutdown_timer); spdk_thread_send_msg(spdk_get_thread(), iscsi_conn_check_shutdown_cb, NULL); - return 1; + return SPDK_POLLER_BUSY; } static void @@ -811,6 +810,7 @@ iscsi_send_logout_request(struct spdk_iscsi_conn *conn) to_be16(&rsph->param3, ISCSI_LOGOUT_REQUEST_TIMEOUT); to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); @@ -826,7 +826,7 @@ logout_request_timeout(void *arg) conn->state = ISCSI_CONN_STATE_EXITING; } - return -1; + return SPDK_POLLER_BUSY; } /* If the connection is running and logout is not requested yet, request logout @@ -864,33 +864,28 @@ iscsi_conn_request_logout(struct spdk_iscsi_conn *conn) } void -iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target) +iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target, int pg_tag) { struct spdk_iscsi_conn *conn; - int i; - 
pthread_mutex_lock(&g_conns_mutex); - - for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { - conn = find_iscsi_connection_by_id(i); - if (conn == NULL) { - continue; - } + if (g_conns_array == MAP_FAILED) { + return; + } - if (target != NULL && conn->target != target) { - continue; + pthread_mutex_lock(&g_conns_mutex); + TAILQ_FOREACH(conn, &g_active_conns, conn_link) { + if ((target == NULL) || + (conn->target == target && (pg_tag < 0 || conn->pg_tag == pg_tag))) { + iscsi_conn_request_logout(conn); } - - iscsi_conn_request_logout(conn); } - pthread_mutex_unlock(&g_conns_mutex); } void shutdown_iscsi_conns(void) { - iscsi_conns_request_logout(NULL); + iscsi_conns_request_logout(NULL, -1); g_shutdown_timer = SPDK_POLLER_REGISTER(iscsi_conn_check_shutdown, NULL, 1000); } @@ -915,19 +910,17 @@ iscsi_drop_conns(struct spdk_iscsi_conn *conn, const char *conn_match, struct spdk_iscsi_conn *xconn; const char *xconn_match; struct spdk_thread *thread; - int i, num; + int num; SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_drop_conns\n"); num = 0; pthread_mutex_lock(&g_conns_mutex); - for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { - xconn = find_iscsi_connection_by_id(i); - - if (xconn == NULL) { - continue; - } + if (g_conns_array == MAP_FAILED) { + goto exit; + } + TAILQ_FOREACH(xconn, &g_active_conns, conn_link) { if (xconn == conn) { continue; } @@ -968,6 +961,7 @@ iscsi_drop_conns(struct spdk_iscsi_conn *conn, const char *conn_match, } } +exit: pthread_mutex_unlock(&g_conns_mutex); if (num != 0) { @@ -984,7 +978,7 @@ _iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *subtask; uint32_t remaining_size; - if (conn->data_in_cnt >= MAX_LARGE_DATAIN_PER_CONNECTION) { + if (conn->data_in_cnt >= g_iscsi.MaxLargeDataInPerConnection) { return -1; } @@ -1056,7 +1050,7 @@ iscsi_conn_handle_queued_datain_tasks(struct spdk_iscsi_conn *conn) struct spdk_iscsi_task *task; while (!TAILQ_EMPTY(&conn->queued_datain_tasks) && - conn->data_in_cnt < 
MAX_LARGE_DATAIN_PER_CONNECTION) { + conn->data_in_cnt < g_iscsi.MaxLargeDataInPerConnection) { task = TAILQ_FIRST(&conn->queued_datain_tasks); assert(task->current_datain_offset <= task->scsi.transfer_len); if (task->current_datain_offset < task->scsi.transfer_len) { @@ -1230,14 +1224,11 @@ process_non_read_task_completion(struct spdk_iscsi_conn *conn, * iscsi_clear_all_transfer_task() in iscsi.c.) */ if (primary->is_r2t_active) { - iscsi_del_transfer_task(conn, primary->tag); if (primary->rsp_scsi_status != SPDK_SCSI_STATUS_GOOD) { iscsi_task_copy_from_rsp_scsi_status(&primary->scsi, primary); } iscsi_task_response(conn, primary); - TAILQ_REMOVE(&conn->active_r2t_tasks, primary, link); - primary->is_r2t_active = false; - iscsi_task_put(primary); + iscsi_del_transfer_task(conn, primary->tag); } } else { iscsi_task_response(conn, task); @@ -1574,6 +1565,12 @@ iscsi_conn_full_feature_migrate(void *arg) { struct spdk_iscsi_conn *conn = arg; + if (conn->state >= ISCSI_CONN_STATE_EXITING) { + /* Connection is being exited before this callback is executed. 
*/ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connection is already exited.\n"); + return; + } + if (conn->sess->session_type == SESSION_TYPE_NORMAL) { iscsi_conn_open_luns(conn); } @@ -1631,7 +1628,6 @@ iscsi_conn_schedule(struct spdk_iscsi_conn *conn) /* Remove this connection from the previous poll group */ iscsi_poll_group_remove_conn(conn->pg, conn); - conn->last_nopin = spdk_get_ticks(); conn->pg = pg; spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(pg)), @@ -1647,7 +1643,7 @@ logout_timeout(void *arg) conn->state = ISCSI_CONN_STATE_EXITING; } - return -1; + return SPDK_POLLER_BUSY; } void diff --git a/lib/iscsi/conn.h b/lib/iscsi/conn.h index dd18bd4b356..c1ccb0ffc17 100644 --- a/lib/iscsi/conn.h +++ b/lib/iscsi/conn.h @@ -137,8 +137,7 @@ struct spdk_iscsi_conn { TAILQ_HEAD(, spdk_iscsi_pdu) write_pdu_list; TAILQ_HEAD(, spdk_iscsi_pdu) snack_pdu_list; - int pending_r2t; - struct spdk_iscsi_task *outstanding_r2t_tasks[DEFAULT_MAXR2T]; + uint32_t pending_r2t; uint16_t cid; @@ -189,13 +188,15 @@ struct spdk_iscsi_conn { uint32_t ttt; /* target transfer tag */ char *partial_text_parameter; - STAILQ_ENTRY(spdk_iscsi_conn) link; + STAILQ_ENTRY(spdk_iscsi_conn) pg_link; bool is_stopped; /* Set true when connection is stopped for migration */ TAILQ_HEAD(queued_r2t_tasks, spdk_iscsi_task) queued_r2t_tasks; TAILQ_HEAD(active_r2t_tasks, spdk_iscsi_task) active_r2t_tasks; TAILQ_HEAD(queued_datain_tasks, spdk_iscsi_task) queued_datain_tasks; struct spdk_iscsi_lun *luns[SPDK_SCSI_DEV_MAX_LUN]; + + TAILQ_ENTRY(spdk_iscsi_conn) conn_link; }; extern struct spdk_iscsi_conn *g_conns_array; @@ -205,7 +206,7 @@ void iscsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task); int initialize_iscsi_conns(void); void shutdown_iscsi_conns(void); -void iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target); +void iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target, int pg_tag); int iscsi_get_active_conns(struct spdk_iscsi_tgt_node *target); int 
iscsi_conn_construct(struct spdk_iscsi_portal *portal, struct spdk_sock *sock); diff --git a/lib/iscsi/iscsi.c b/lib/iscsi/iscsi.c index a54617dc434..b32cc116c50 100644 --- a/lib/iscsi/iscsi.c +++ b/lib/iscsi/iscsi.c @@ -757,8 +757,7 @@ append_iscsi_sess(struct spdk_iscsi_conn *conn, } static int -iscsi_append_text(struct spdk_iscsi_conn *conn __attribute__((__unused__)), - const char *key, const char *val, uint8_t *data, +iscsi_append_text(const char *key, const char *val, uint8_t *data, int alloc_len, int data_len) { int total; @@ -789,7 +788,6 @@ iscsi_append_param(struct spdk_iscsi_conn *conn, const char *key, uint8_t *data, int alloc_len, int data_len) { struct iscsi_param *param; - int rc; param = iscsi_param_find(conn->params, key); if (param == NULL) { @@ -799,9 +797,8 @@ iscsi_append_param(struct spdk_iscsi_conn *conn, const char *key, return data_len; } } - rc = iscsi_append_text(conn, param->key, param->val, data, - alloc_len, data_len); - return rc; + return iscsi_append_text(param->key, param->val, data, + alloc_len, data_len); } static int @@ -866,21 +863,18 @@ iscsi_auth_params(struct spdk_iscsi_conn *conn, if (new_val == NULL) { snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Reject"); new_val = in_val; - iscsi_append_text(conn, "CHAP_A", new_val, - data, alloc_len, total); + iscsi_append_text("CHAP_A", new_val, data, alloc_len, total); goto error_return; } /* selected algorithm is 5 (MD5) */ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_A=%s\n", new_val); - total = iscsi_append_text(conn, "CHAP_A", new_val, - data, alloc_len, total); + total = iscsi_append_text("CHAP_A", new_val, data, alloc_len, total); /* Identifier is one octet */ gen_random(conn->auth.chap_id, 1); snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", (int) conn->auth.chap_id[0]); - total = iscsi_append_text(conn, "CHAP_I", in_val, - data, alloc_len, total); + total = iscsi_append_text("CHAP_I", in_val, data, alloc_len, total); /* Challenge Value is a variable stream of octets */ /* 
(binary length MUST not exceed 1024 bytes) */ @@ -888,8 +882,7 @@ iscsi_auth_params(struct spdk_iscsi_conn *conn, gen_random(conn->auth.chap_challenge, conn->auth.chap_challenge_len); bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, conn->auth.chap_challenge, conn->auth.chap_challenge_len); - total = iscsi_append_text(conn, "CHAP_C", in_val, - data, alloc_len, total); + total = iscsi_append_text("CHAP_C", in_val, data, alloc_len, total); conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_NR; } else if ((name = iscsi_param_get_val(params, "CHAP_N")) != NULL) { @@ -1031,10 +1024,9 @@ iscsi_auth_params(struct spdk_iscsi_conn *conn, bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, tgtmd5, SPDK_MD5DIGEST_LEN); - total = iscsi_append_text(conn, "CHAP_N", - conn->auth.muser, data, alloc_len, total); - total = iscsi_append_text(conn, "CHAP_R", - in_val, data, alloc_len, total); + total = iscsi_append_text("CHAP_N", conn->auth.muser, data, + alloc_len, total); + total = iscsi_append_text("CHAP_R", in_val, data, alloc_len, total); } else { /* not mutual */ if (conn->mutual_chap) { @@ -1203,9 +1195,8 @@ iscsi_op_login_response(struct spdk_iscsi_conn *conn, /* Set T/CSG/NSG to reserved if login error. 
*/ if (rsph->status_class != 0) { - rsph->flags &= ~ISCSI_LOGIN_TRANSIT; - rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK; - rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK; + rsph->flags &= ~(ISCSI_LOGIN_TRANSIT | ISCSI_LOGIN_CURRENT_STAGE_MASK | + ISCSI_LOGIN_NEXT_STAGE_MASK); } iscsi_param_free(params); iscsi_conn_write_pdu(conn, rsp_pdu, cb_fn, conn); @@ -1224,7 +1215,6 @@ iscsi_op_login_rsp_init(struct spdk_iscsi_conn *conn, { struct iscsi_bhs_login_req *reqh; struct iscsi_bhs_login_rsp *rsph; - uint32_t alloc_len; rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; rsph->opcode = ISCSI_OP_LOGIN_RSP; @@ -1232,26 +1222,19 @@ iscsi_op_login_rsp_init(struct spdk_iscsi_conn *conn, rsph->status_detail = ISCSI_LOGIN_ACCEPT; rsp_pdu->data_segment_len = 0; - /* Default MaxRecvDataSegmentLength - RFC3720(12.12) */ - if (conn->MaxRecvDataSegmentLength < 8192) { - alloc_len = 8192; - } else { - alloc_len = conn->MaxRecvDataSegmentLength; - } - - rsp_pdu->data = calloc(1, alloc_len); + /* The default MaxRecvDataSegmentLength 8192 is used during login. 
- RFC3720 */ + rsp_pdu->data = calloc(1, 8192); if (!rsp_pdu->data) { SPDK_ERRLOG("calloc() failed for data segment\n"); rsph->status_class = ISCSI_CLASS_TARGET_ERROR; rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES; return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; } - rsp_pdu->data_buf_len = alloc_len; + rsp_pdu->data_buf_len = 8192; reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; - rsph->flags |= (reqh->flags & ISCSI_LOGIN_TRANSIT); - rsph->flags |= (reqh->flags & ISCSI_LOGIN_CONTINUE); - rsph->flags |= (reqh->flags & ISCSI_LOGIN_CURRENT_STAGE_MASK); + rsph->flags |= (reqh->flags & (ISCSI_LOGIN_TRANSIT | ISCSI_LOGIN_CONTINUE | + ISCSI_LOGIN_CURRENT_STAGE_MASK)); if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { rsph->flags |= (reqh->flags & ISCSI_LOGIN_NEXT_STAGE_MASK); } @@ -1310,11 +1293,9 @@ iscsi_op_login_rsp_init(struct spdk_iscsi_conn *conn, if ((ISCSI_BHS_LOGIN_GET_NSG(rsph->flags) == ISCSI_NSG_RESERVED_CODE) && ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { - /* set NSG to zero */ - rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK; - /* also set other bits to zero */ - rsph->flags &= ~ISCSI_LOGIN_TRANSIT; - rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK; + /* set NSG and other bits to zero */ + rsph->flags &= ~(ISCSI_LOGIN_NEXT_STAGE_MASK | ISCSI_LOGIN_TRANSIT | + ISCSI_LOGIN_CURRENT_STAGE_MASK); SPDK_ERRLOG("Received reserved NSG code: %d\n", ISCSI_NSG_RESERVED_CODE); /* Initiator error */ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; @@ -1446,8 +1427,8 @@ iscsi_op_login_check_target(struct spdk_iscsi_conn *conn, const char *target_name, struct spdk_iscsi_tgt_node **target) { - bool result; struct iscsi_bhs_login_rsp *rsph; + char buf[MAX_TMPBUF] = {}; rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; *target = iscsi_find_tgt_node(target_name); @@ -1464,10 +1445,19 @@ iscsi_op_login_check_target(struct spdk_iscsi_conn *conn, rsph->status_detail = ISCSI_LOGIN_TARGET_REMOVED; return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; } - result = iscsi_tgt_node_access(conn, *target, - 
conn->initiator_name, - conn->initiator_addr); - if (!result) { + if (iscsi_tgt_node_is_redirected(conn, *target, buf, MAX_TMPBUF)) { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "target %s is redirectd\n", target_name); + rsp_pdu->data_segment_len = iscsi_append_text("TargetAddress", + buf, + rsp_pdu->data, + rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + rsph->status_class = ISCSI_CLASS_REDIRECT; + rsph->status_detail = ISCSI_LOGIN_TARGET_TEMPORARILY_MOVED; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + if (!iscsi_tgt_node_access(conn, *target, conn->initiator_name, + conn->initiator_addr)) { SPDK_ERRLOG("access denied\n"); rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; rsph->status_detail = ISCSI_LOGIN_AUTHORIZATION_FAIL; @@ -2396,7 +2386,6 @@ iscsi_pdu_payload_op_text(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *p data_len = iscsi_send_tgts(conn, conn->initiator_name, - conn->initiator_addr, val, data, alloc_len, data_len); } else { @@ -2406,14 +2395,11 @@ iscsi_pdu_payload_op_text(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *p if (strcasecmp(val, "ALL") == 0) { /* not in discovery session */ - data_len = iscsi_append_text(conn, - "SendTargets", - "Reject", data, - alloc_len, data_len); + data_len = iscsi_append_text("SendTargets", "Reject", + data, alloc_len, data_len); } else { data_len = iscsi_send_tgts(conn, conn->initiator_name, - conn->initiator_addr, val, data, alloc_len, data_len); } @@ -2734,7 +2720,6 @@ add_transfer_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) size_t segment_len; size_t data_len; int len; - int idx; int rc; int data_out_req; @@ -2750,15 +2735,14 @@ add_transfer_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) * and start sending R2T for it after some of the tasks using R2T/data * out buffers complete. 
*/ - if (conn->pending_r2t >= DEFAULT_MAXR2T) { + if (conn->pending_r2t >= g_iscsi.MaxR2TPerConnection) { TAILQ_INSERT_TAIL(&conn->queued_r2t_tasks, task, link); return 0; } conn->data_out_cnt += data_out_req; - idx = conn->pending_r2t++; + conn->pending_r2t++; - conn->outstanding_r2t_tasks[idx] = task; task->next_expected_r2t_offset = data_len; task->current_r2t_length = 0; task->R2TSN = 0; @@ -2801,7 +2785,7 @@ start_queued_transfer_tasks(struct spdk_iscsi_conn *conn) struct spdk_iscsi_task *task, *tmp; TAILQ_FOREACH_SAFE(task, &conn->queued_r2t_tasks, link, tmp) { - if (conn->pending_r2t < DEFAULT_MAXR2T) { + if (conn->pending_r2t < g_iscsi.MaxR2TPerConnection) { TAILQ_REMOVE(&conn->queued_r2t_tasks, task, link); add_transfer_task(conn, task); } else { @@ -2813,20 +2797,21 @@ start_queued_transfer_tasks(struct spdk_iscsi_conn *conn) bool iscsi_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t task_tag) { - struct spdk_iscsi_task *task; - int i; + struct spdk_iscsi_task *task, *tmp; - for (i = 0; i < conn->pending_r2t; i++) { - if (conn->outstanding_r2t_tasks[i]->tag == task_tag) { - task = conn->outstanding_r2t_tasks[i]; + TAILQ_FOREACH_SAFE(task, &conn->active_r2t_tasks, link, tmp) { + if (task->tag == task_tag) { assert(conn->data_out_cnt >= task->data_out_cnt); conn->data_out_cnt -= task->data_out_cnt; + assert(conn->pending_r2t > 0); conn->pending_r2t--; - for (; i < conn->pending_r2t; i++) { - conn->outstanding_r2t_tasks[i] = conn->outstanding_r2t_tasks[i + 1]; - } - conn->outstanding_r2t_tasks[conn->pending_r2t] = NULL; + + assert(task->is_r2t_active == true); + TAILQ_REMOVE(&conn->active_r2t_tasks, task, link); + task->is_r2t_active = false; + iscsi_task_put(task); + start_queued_transfer_tasks(conn); return true; } @@ -2834,26 +2819,26 @@ iscsi_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t task_tag) return false; } -static void -del_connection_queued_task(struct spdk_iscsi_conn *conn, void *tailq, - struct spdk_scsi_lun *lun, - struct 
spdk_iscsi_pdu *pdu) +void iscsi_clear_all_transfer_task(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun, + struct spdk_iscsi_pdu *pdu) { struct spdk_iscsi_task *task, *task_tmp; struct spdk_iscsi_pdu *pdu_tmp; - /* - * Temporary used to index spdk_scsi_task related - * queues of the connection. - */ - TAILQ_HEAD(queued_tasks, spdk_iscsi_task) *head; - head = (struct queued_tasks *)tailq; - - TAILQ_FOREACH_SAFE(task, head, link, task_tmp) { + TAILQ_FOREACH_SAFE(task, &conn->active_r2t_tasks, link, task_tmp) { pdu_tmp = iscsi_task_get_pdu(task); if ((lun == NULL || lun == task->scsi.lun) && (pdu == NULL || spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn))) { - TAILQ_REMOVE(head, task, link); + task->outstanding_r2t = 0; + task->next_r2t_offset = 0; + task->next_expected_r2t_offset = 0; + assert(conn->data_out_cnt >= task->data_out_cnt); + conn->data_out_cnt -= task->data_out_cnt; + assert(conn->pending_r2t > 0); + conn->pending_r2t--; + + TAILQ_REMOVE(&conn->active_r2t_tasks, task, link); task->is_r2t_active = false; if (lun != NULL && spdk_scsi_lun_is_removing(lun)) { spdk_scsi_task_process_null_lun(&task->scsi); @@ -2862,59 +2847,32 @@ del_connection_queued_task(struct spdk_iscsi_conn *conn, void *tailq, iscsi_task_put(task); } } -} -void iscsi_clear_all_transfer_task(struct spdk_iscsi_conn *conn, - struct spdk_scsi_lun *lun, - struct spdk_iscsi_pdu *pdu) -{ - int i, j, pending_r2t; - struct spdk_iscsi_task *task; - struct spdk_iscsi_pdu *pdu_tmp; - - pending_r2t = conn->pending_r2t; - for (i = 0; i < pending_r2t; i++) { - task = conn->outstanding_r2t_tasks[i]; + TAILQ_FOREACH_SAFE(task, &conn->queued_r2t_tasks, link, task_tmp) { pdu_tmp = iscsi_task_get_pdu(task); if ((lun == NULL || lun == task->scsi.lun) && (pdu == NULL || spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn))) { - conn->outstanding_r2t_tasks[i] = NULL; - task->outstanding_r2t = 0; - task->next_r2t_offset = 0; - task->next_expected_r2t_offset = 0; - assert(conn->data_out_cnt >= task->data_out_cnt); - 
conn->data_out_cnt -= task->data_out_cnt; - conn->pending_r2t--; - } - } - - for (i = 0; i < pending_r2t; i++) { - if (conn->outstanding_r2t_tasks[i] != NULL) { - continue; - } - for (j = i + 1; j < pending_r2t; j++) { - if (conn->outstanding_r2t_tasks[j] != NULL) { - conn->outstanding_r2t_tasks[i] = conn->outstanding_r2t_tasks[j]; - conn->outstanding_r2t_tasks[j] = NULL; - break; + TAILQ_REMOVE(&conn->queued_r2t_tasks, task, link); + task->is_r2t_active = false; + if (lun != NULL && spdk_scsi_lun_is_removing(lun)) { + spdk_scsi_task_process_null_lun(&task->scsi); + iscsi_task_response(conn, task); } + iscsi_task_put(task); } } - del_connection_queued_task(conn, &conn->active_r2t_tasks, lun, pdu); - del_connection_queued_task(conn, &conn->queued_r2t_tasks, lun, pdu); - start_queued_transfer_tasks(conn); } static struct spdk_iscsi_task * get_transfer_task(struct spdk_iscsi_conn *conn, uint32_t transfer_tag) { - int i; + struct spdk_iscsi_task *task; - for (i = 0; i < conn->pending_r2t; i++) { - if (conn->outstanding_r2t_tasks[i]->ttt == transfer_tag) { - return (conn->outstanding_r2t_tasks[i]); + TAILQ_FOREACH(task, &conn->active_r2t_tasks, link) { + if (task->ttt == transfer_tag) { + return task; } } @@ -3092,8 +3050,7 @@ iscsi_transfer_in(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) for (; offset < sequence_end; offset += segment_len) { len = spdk_min(segment_len, (sequence_end - offset)); - datain_flag &= ~ISCSI_FLAG_FINAL; - datain_flag &= ~ISCSI_DATAIN_STATUS; + datain_flag &= ~(ISCSI_FLAG_FINAL | ISCSI_DATAIN_STATUS); if (offset + len == sequence_end) { /* last PDU in a sequence */ @@ -3481,31 +3438,6 @@ iscsi_pdu_payload_op_scsi(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *p return SPDK_ISCSI_CONNECTION_FATAL; } -static void -abort_transfer_task_in_task_mgmt_resp(struct spdk_iscsi_conn *conn, - struct spdk_iscsi_task *task) -{ - struct spdk_iscsi_pdu *pdu; - - pdu = iscsi_task_get_pdu(task); - - switch (task->scsi.function) { - /* abort 
task identified by Reference Task Tag field */ - case ISCSI_TASK_FUNC_ABORT_TASK: - iscsi_del_transfer_task(conn, task->scsi.abort_id); - break; - - /* abort all tasks issued via this session on the LUN */ - case ISCSI_TASK_FUNC_ABORT_TASK_SET: - iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu); - break; - - case ISCSI_TASK_FUNC_LOGICAL_UNIT_RESET: - iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu); - break; - } -} - void iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) @@ -3530,11 +3462,9 @@ iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn, rsph->flags |= 0x80; /* bit 0 default to 1 */ switch (task->scsi.response) { case SPDK_SCSI_TASK_MGMT_RESP_COMPLETE: - abort_transfer_task_in_task_mgmt_resp(conn, task); rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE; break; case SPDK_SCSI_TASK_MGMT_RESP_SUCCESS: - abort_transfer_task_in_task_mgmt_resp(conn, task); rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE; break; case SPDK_SCSI_TASK_MGMT_RESP_REJECT: @@ -3589,12 +3519,12 @@ _iscsi_op_abort_task(void *arg) rc = iscsi_conn_abort_queued_datain_task(task->conn, task->scsi.abort_id); if (rc != 0) { - return 1; + return SPDK_POLLER_BUSY; } spdk_poller_unregister(&task->mgmt_poller); iscsi_queue_mgmt_task(task->conn, task); - return 1; + return SPDK_POLLER_BUSY; } static void @@ -3614,12 +3544,12 @@ _iscsi_op_abort_task_set(void *arg) rc = iscsi_conn_abort_queued_datain_tasks(task->conn, task->scsi.lun, task->pdu); if (rc != 0) { - return 1; + return SPDK_POLLER_BUSY; } spdk_poller_unregister(&task->mgmt_poller); iscsi_queue_mgmt_task(task->conn, task); - return 1; + return SPDK_POLLER_BUSY; } void @@ -3686,6 +3616,7 @@ iscsi_pdu_hdr_op_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) case ISCSI_TASK_FUNC_ABORT_TASK: SPDK_NOTICELOG("ABORT_TASK\n"); + iscsi_del_transfer_task(conn, ref_task_tag); iscsi_op_abort_task(task, ref_task_tag); return 0; @@ -3693,6 +3624,7 @@ iscsi_pdu_hdr_op_task(struct 
spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) case ISCSI_TASK_FUNC_ABORT_TASK_SET: SPDK_NOTICELOG("ABORT_TASK_SET\n"); + iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu); iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET); return 0; @@ -3709,6 +3641,7 @@ iscsi_pdu_hdr_op_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) case ISCSI_TASK_FUNC_LOGICAL_UNIT_RESET: SPDK_NOTICELOG("LOGICAL_UNIT_RESET\n"); + iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu); iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); return 0; @@ -4649,6 +4582,17 @@ iscsi_read_pdu(struct spdk_iscsi_conn *conn) } } + /* conn->is_logged_out must be checked after completing to process + * logout request, i.e., before processing PDU header in this state + * machine, otherwise logout response may not be sent to initiator + * and initiator may get logout timeout. + */ + if (spdk_unlikely(conn->is_logged_out)) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "pdu received after logout\n"); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + pdu->data_segment_len = ISCSI_ALIGN(DGET24(pdu->bhs.data_segment_len)); /* AHS */ @@ -4776,12 +4720,6 @@ iscsi_read_pdu(struct spdk_iscsi_conn *conn) } } - if (conn->is_logged_out) { - SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "pdu received after logout\n"); - conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; - break; - } - if (!pdu->is_rejected) { rc = iscsi_pdu_payload_handle(conn, pdu); } else { diff --git a/lib/iscsi/iscsi.h b/lib/iscsi/iscsi.h index b1747e4ab95..eb7e9bd3be4 100644 --- a/lib/iscsi/iscsi.h +++ b/lib/iscsi/iscsi.h @@ -91,11 +91,11 @@ #define MAX_DATA_OUT_PER_CONNECTION 16 /* - * Defines maximum number of data in buffers each connection can have in + * Defines default maximum number of data in buffers each connection can have in * use at any given time. So this limit does not affect I/O smaller than * SPDK_BDEV_SMALL_BUF_MAX_SIZE. 
*/ -#define MAX_LARGE_DATAIN_PER_CONNECTION 64 +#define DEFAULT_MAX_LARGE_DATAIN_PER_CONNECTION 64 #define SPDK_ISCSI_MAX_BURST_LENGTH \ (SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH * MAX_DATA_OUT_PER_CONNECTION) @@ -320,6 +320,8 @@ struct spdk_iscsi_opts { bool ImmediateData; uint32_t ErrorRecoveryLevel; bool AllowDuplicateIsid; + uint32_t MaxLargeDataInPerConnection; + uint32_t MaxR2TPerConnection; }; struct spdk_iscsi_globals { @@ -351,6 +353,8 @@ struct spdk_iscsi_globals { bool ImmediateData; uint32_t ErrorRecoveryLevel; bool AllowDuplicateIsid; + uint32_t MaxLargeDataInPerConnection; + uint32_t MaxR2TPerConnection; struct spdk_mempool *pdu_pool; struct spdk_mempool *pdu_immediate_data_pool; diff --git a/lib/iscsi/iscsi_rpc.c b/lib/iscsi/iscsi_rpc.c index 6df6a3b9700..598ef28f0f2 100644 --- a/lib/iscsi/iscsi_rpc.c +++ b/lib/iscsi/iscsi_rpc.c @@ -731,6 +731,7 @@ struct rpc_portal_list { struct rpc_portal_group { int32_t tag; struct rpc_portal_list portal_list; + bool is_private; }; static void @@ -784,6 +785,7 @@ decode_rpc_portal_list(const struct spdk_json_val *val, void *out) static const struct spdk_json_object_decoder rpc_portal_group_decoders[] = { {"tag", offsetof(struct rpc_portal_group, tag), spdk_json_decode_int32}, {"portals", offsetof(struct rpc_portal_group, portal_list), decode_rpc_portal_list}, + {"private", offsetof(struct rpc_portal_group, is_private), spdk_json_decode_bool, true}, }; static void @@ -804,7 +806,7 @@ rpc_iscsi_create_portal_group(struct spdk_jsonrpc_request *request, goto out; } - pg = iscsi_portal_grp_create(req.tag); + pg = iscsi_portal_grp_create(req.tag, req.is_private); if (pg == NULL) { SPDK_ERRLOG("portal_grp_create failed\n"); goto out; @@ -977,7 +979,7 @@ _rpc_iscsi_get_connections(struct spdk_io_channel_iter *i) struct spdk_iscsi_poll_group *pg = spdk_io_channel_get_ctx(ch); struct spdk_iscsi_conn *conn; - STAILQ_FOREACH(conn, &pg->connections, link) { + STAILQ_FOREACH(conn, &pg->connections, pg_link) { 
iscsi_conn_info_json(ctx->w, conn); } @@ -1147,6 +1149,134 @@ SPDK_RPC_REGISTER("iscsi_target_node_set_auth", rpc_iscsi_target_node_set_auth, SPDK_RPC_RUNTIME) SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_set_auth, set_iscsi_target_node_auth) +struct rpc_target_redirect { + char *name; + int32_t pg_tag; + char *redirect_host; + char *redirect_port; +}; + +static void +free_rpc_target_redirect(struct rpc_target_redirect *req) +{ + free(req->name); + free(req->redirect_host); + free(req->redirect_port); +} + +static const struct spdk_json_object_decoder rpc_target_redirect_decoders[] = { + {"name", offsetof(struct rpc_target_redirect, name), spdk_json_decode_string}, + {"pg_tag", offsetof(struct rpc_target_redirect, pg_tag), spdk_json_decode_int32}, + {"redirect_host", offsetof(struct rpc_target_redirect, redirect_host), spdk_json_decode_string, true}, + {"redirect_port", offsetof(struct rpc_target_redirect, redirect_port), spdk_json_decode_string, true}, +}; + +static void +rpc_iscsi_target_node_set_redirect(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_redirect req = {}; + struct spdk_iscsi_tgt_node *target; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_target_redirect_decoders, + SPDK_COUNTOF(rpc_target_redirect_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_target_redirect(&req); + return; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target %s is not found\n", req.name); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Target %s is not found", req.name); + free_rpc_target_redirect(&req); + return; + } + + rc = iscsi_tgt_node_redirect(target, req.pg_tag, req.redirect_host, req.redirect_port); + if (rc != 0) { + SPDK_ERRLOG("failed to redirect 
target %s\n", req.name); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Failed to redirect target %s, (%d): %s", + req.name, rc, spdk_strerror(-rc)); + free_rpc_target_redirect(&req); + return; + } + + free_rpc_target_redirect(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_target_node_set_redirect", rpc_iscsi_target_node_set_redirect, + SPDK_RPC_RUNTIME) + +struct rpc_target_logout { + char *name; + int32_t pg_tag; +}; + +static void +free_rpc_target_logout(struct rpc_target_logout *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_target_logout_decoders[] = { + {"name", offsetof(struct rpc_target_logout, name), spdk_json_decode_string}, + {"pg_tag", offsetof(struct rpc_target_logout, pg_tag), spdk_json_decode_int32, true}, +}; + +static void +rpc_iscsi_target_node_request_logout(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_logout req = {}; + struct spdk_iscsi_tgt_node *target; + struct spdk_json_write_ctx *w; + + /* If pg_tag is omitted, request all connections to the specified target + * to logout. 
+ */ + req.pg_tag = -1; + + if (spdk_json_decode_object(params, rpc_target_logout_decoders, + SPDK_COUNTOF(rpc_target_logout_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_target_logout(&req); + return; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target %s is not found\n", req.name); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Target %s is not found", req.name); + free_rpc_target_logout(&req); + return; + } + + iscsi_conns_request_logout(target, req.pg_tag); + + free_rpc_target_logout(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_target_node_request_logout", rpc_iscsi_target_node_request_logout, + SPDK_RPC_RUNTIME) + static void rpc_iscsi_get_options(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) @@ -1586,6 +1716,8 @@ static const struct spdk_json_object_decoder rpc_set_iscsi_opts_decoders[] = { {"immediate_data", offsetof(struct spdk_iscsi_opts, ImmediateData), spdk_json_decode_bool, true}, {"error_recovery_level", offsetof(struct spdk_iscsi_opts, ErrorRecoveryLevel), spdk_json_decode_uint32, true}, {"allow_duplicated_isid", offsetof(struct spdk_iscsi_opts, AllowDuplicateIsid), spdk_json_decode_bool, true}, + {"max_large_datain_per_connection", offsetof(struct spdk_iscsi_opts, MaxLargeDataInPerConnection), spdk_json_decode_uint32, true}, + {"max_r2t_per_connection", offsetof(struct spdk_iscsi_opts, MaxR2TPerConnection), spdk_json_decode_uint32, true}, }; static void diff --git a/lib/iscsi/iscsi_subsystem.c b/lib/iscsi/iscsi_subsystem.c index 66831e32cbf..0f66dcb41b4 100644 --- a/lib/iscsi/iscsi_subsystem.c +++ b/lib/iscsi/iscsi_subsystem.c @@ -139,7 +139,9 @@ mobj_ctor(struct spdk_mempool *mp, 
__attribute__((unused)) void *arg, ~ISCSI_DATA_BUFFER_MASK); } -#define NUM_PDU_PER_CONNECTION(iscsi) (2 * (iscsi->MaxQueueDepth + MAX_LARGE_DATAIN_PER_CONNECTION + 8)) +#define NUM_PDU_PER_CONNECTION(iscsi) (2 * (iscsi->MaxQueueDepth + \ + iscsi->MaxLargeDataInPerConnection + \ + 2 * iscsi->MaxR2TPerConnection + 8)) #define PDU_POOL_SIZE(iscsi) (iscsi->MaxConnections * NUM_PDU_PER_CONNECTION(iscsi)) #define IMMEDIATE_DATA_POOL_SIZE(iscsi) (iscsi->MaxConnections * 128) #define DATA_OUT_POOL_SIZE(iscsi) (iscsi->MaxConnections * MAX_DATA_OUT_PER_CONNECTION) @@ -376,6 +378,12 @@ iscsi_log_globals(void) "DiscoveryAuthGroup AuthGroup%d\n", g_iscsi.chap_group); } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxLargeDataInPerConnection %d\n", + g_iscsi.MaxLargeDataInPerConnection); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxR2TPerConnection %d\n", + g_iscsi.MaxR2TPerConnection); } static void @@ -398,6 +406,8 @@ iscsi_opts_init(struct spdk_iscsi_opts *opts) opts->chap_group = 0; opts->authfile = NULL; opts->nodebase = NULL; + opts->MaxLargeDataInPerConnection = DEFAULT_MAX_LARGE_DATAIN_PER_CONNECTION; + opts->MaxR2TPerConnection = DEFAULT_MAXR2T; } struct spdk_iscsi_opts * @@ -470,6 +480,8 @@ iscsi_opts_copy(struct spdk_iscsi_opts *src) dst->require_chap = src->require_chap; dst->mutual_chap = src->mutual_chap; dst->chap_group = src->chap_group; + dst->MaxLargeDataInPerConnection = src->MaxLargeDataInPerConnection; + dst->MaxR2TPerConnection = src->MaxR2TPerConnection; return dst; } @@ -694,6 +706,16 @@ iscsi_opts_verify(struct spdk_iscsi_opts *opts) return -EINVAL; } + if (opts->MaxLargeDataInPerConnection == 0) { + SPDK_ERRLOG("0 is invalid. MaxLargeDataInPerConnection must be more than 0\n"); + return -EINVAL; + } + + if (opts->MaxR2TPerConnection == 0) { + SPDK_ERRLOG("0 is invalid. 
MaxR2TPerConnection must be more than 0\n"); + return -EINVAL; + } + return 0; } @@ -767,6 +789,8 @@ iscsi_set_global_params(struct spdk_iscsi_opts *opts) g_iscsi.require_chap = opts->require_chap; g_iscsi.mutual_chap = opts->mutual_chap; g_iscsi.chap_group = opts->chap_group; + g_iscsi.MaxLargeDataInPerConnection = opts->MaxLargeDataInPerConnection; + g_iscsi.MaxR2TPerConnection = opts->MaxR2TPerConnection; iscsi_log_globals(); @@ -1189,7 +1213,7 @@ iscsi_poll_group_poll(void *ctx) int rc; if (spdk_unlikely(STAILQ_EMPTY(&group->connections))) { - return 0; + return SPDK_POLLER_IDLE; } rc = spdk_sock_group_poll(group->sock_group); @@ -1197,13 +1221,13 @@ iscsi_poll_group_poll(void *ctx) SPDK_ERRLOG("Failed to poll sock_group=%p\n", group->sock_group); } - STAILQ_FOREACH_SAFE(conn, &group->connections, link, tmp) { + STAILQ_FOREACH_SAFE(conn, &group->connections, pg_link, tmp) { if (conn->state == ISCSI_CONN_STATE_EXITING) { iscsi_conn_destruct(conn); } } - return rc; + return rc != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; } static int @@ -1212,11 +1236,11 @@ iscsi_poll_group_handle_nop(void *ctx) struct spdk_iscsi_poll_group *group = ctx; struct spdk_iscsi_conn *conn, *tmp; - STAILQ_FOREACH_SAFE(conn, &group->connections, link, tmp) { + STAILQ_FOREACH_SAFE(conn, &group->connections, pg_link, tmp) { iscsi_conn_handle_nop(conn); } - return -1; + return SPDK_POLLER_BUSY; } static int @@ -1483,6 +1507,11 @@ iscsi_opts_info_json(struct spdk_json_write_ctx *w) spdk_json_write_named_bool(w, "mutual_chap", g_iscsi.mutual_chap); spdk_json_write_named_int32(w, "chap_group", g_iscsi.chap_group); + spdk_json_write_named_uint32(w, "max_large_datain_per_connection", + g_iscsi.MaxLargeDataInPerConnection); + spdk_json_write_named_uint32(w, "max_r2t_per_connection", + g_iscsi.MaxR2TPerConnection); + spdk_json_write_object_end(w); } diff --git a/lib/iscsi/portal_grp.c b/lib/iscsi/portal_grp.c index 986562ad77f..3203832b4c8 100644 --- a/lib/iscsi/portal_grp.c +++ 
b/lib/iscsi/portal_grp.c @@ -216,13 +216,14 @@ iscsi_portal_close(struct spdk_iscsi_portal *p) static int iscsi_parse_portal(const char *portalstring, struct spdk_iscsi_portal **ip) { - char *host = NULL, *port = NULL; - int len, rc = -1; + char host[MAX_PORTAL_ADDR + 1] = {}; + char port[MAX_PORTAL_PORT + 1] = {}; + int len; const char *p; if (portalstring == NULL) { SPDK_ERRLOG("portal error\n"); - goto error_out; + return -EINVAL; } /* IP address */ @@ -231,7 +232,7 @@ iscsi_parse_portal(const char *portalstring, struct spdk_iscsi_portal **ip) p = strchr(portalstring + 1, ']'); if (p == NULL) { SPDK_ERRLOG("portal error\n"); - goto error_out; + return -EINVAL; } p++; } else { @@ -243,29 +244,20 @@ iscsi_parse_portal(const char *portalstring, struct spdk_iscsi_portal **ip) } len = p - portalstring; - host = malloc(len + 1); - if (host == NULL) { - SPDK_ERRLOG("malloc() failed for host\n"); - goto error_out; + if (len > MAX_PORTAL_ADDR) { + return -EINVAL; } memcpy(host, portalstring, len); host[len] = '\0'; /* Port number (IPv4 and IPv6 are the same) */ if (p[0] == '\0') { - port = malloc(PORTNUMSTRLEN); - if (!port) { - SPDK_ERRLOG("malloc() failed for port\n"); - goto error_out; - } - snprintf(port, PORTNUMSTRLEN, "%d", DEFAULT_PORT); + snprintf(port, MAX_PORTAL_PORT, "%d", DEFAULT_PORT); } else { p++; len = strlen(p); - port = malloc(len + 1); - if (port == NULL) { - SPDK_ERRLOG("malloc() failed for port\n"); - goto error_out; + if (len > MAX_PORTAL_PORT) { + return -EINVAL; } memcpy(port, p, len); port[len] = '\0'; @@ -273,19 +265,48 @@ iscsi_parse_portal(const char *portalstring, struct spdk_iscsi_portal **ip) *ip = iscsi_portal_create(host, port); if (!*ip) { - goto error_out; + return -EINVAL; } - rc = 0; -error_out: - free(host); - free(port); + return 0; +} + +int +iscsi_parse_redirect_addr(struct sockaddr_storage *sa, + const char *host, const char *port) +{ + struct addrinfo hints, *res; + int rc; + + if (host == NULL || port == NULL) { + return 
-EINVAL; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = PF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_NUMERICSERV; + hints.ai_flags |= AI_NUMERICHOST; + rc = getaddrinfo(host, port, &hints, &res); + if (rc != 0) { + SPDK_ERRLOG("getaddinrfo failed: %s (%d)\n", gai_strerror(rc), rc); + return -EINVAL; + } + + if (res->ai_addrlen > sizeof(*sa)) { + SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", + (size_t)res->ai_addrlen); + rc = -EINVAL; + } else { + memcpy(sa, res->ai_addr, res->ai_addrlen); + } + freeaddrinfo(res); return rc; } struct spdk_iscsi_portal_grp * -iscsi_portal_grp_create(int tag) +iscsi_portal_grp_create(int tag, bool is_private) { struct spdk_iscsi_portal_grp *pg = malloc(sizeof(*pg)); @@ -296,6 +317,7 @@ iscsi_portal_grp_create(int tag) pg->ref = 0; pg->tag = tag; + pg->is_private = is_private; pthread_mutex_lock(&g_iscsi.mutex); pg->disable_chap = g_iscsi.disable_chap; @@ -354,6 +376,21 @@ iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg, TAILQ_INSERT_TAIL(&pg->head, p, per_pg_tailq); } +struct spdk_iscsi_portal * +iscsi_portal_grp_find_portal_by_addr(struct spdk_iscsi_portal_grp *pg, + const char *host, const char *port) +{ + struct spdk_iscsi_portal *p; + + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + if (!strcmp(p->host, host) && !strcmp(p->port, port)) { + return p; + } + } + + return NULL; +} + int iscsi_portal_grp_set_chap_params(struct spdk_iscsi_portal_grp *pg, bool disable_chap, bool require_chap, @@ -389,7 +426,7 @@ iscsi_parse_portal_grp(struct spdk_conf_section *sp) SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); } - pg = iscsi_portal_grp_create(spdk_conf_section_get_num(sp)); + pg = iscsi_portal_grp_create(spdk_conf_section_get_num(sp), false); if (!pg) { SPDK_ERRLOG("portal group malloc error (%s)\n", spdk_conf_section_get_name(sp)); return -1; @@ -617,6 +654,8 @@ iscsi_portal_grp_info_json(struct spdk_iscsi_portal_grp *pg, } spdk_json_write_array_end(w); + 
spdk_json_write_named_bool(w, "private", pg->is_private); + spdk_json_write_object_end(w); } diff --git a/lib/iscsi/portal_grp.h b/lib/iscsi/portal_grp.h index 7ac72e36c7e..07682edbc9e 100644 --- a/lib/iscsi/portal_grp.h +++ b/lib/iscsi/portal_grp.h @@ -54,6 +54,16 @@ struct spdk_iscsi_portal { struct spdk_iscsi_portal_grp { int ref; int tag; + + /* For login redirection, there are two types of portal groups, public and + * private portal groups. Public portal groups have their portals returned + * by a discovery session. Private portal groups do not have their portals + * returned by a discovery session. A public portal group may optionally + * specify a redirect portal for non-discovery logins. This redirect portal + * must be from a private portal group. + */ + bool is_private; + bool disable_chap; bool require_chap; bool mutual_chap; @@ -67,9 +77,12 @@ struct spdk_iscsi_portal_grp { struct spdk_iscsi_portal *iscsi_portal_create(const char *host, const char *port); void iscsi_portal_destroy(struct spdk_iscsi_portal *p); -struct spdk_iscsi_portal_grp *iscsi_portal_grp_create(int tag); +struct spdk_iscsi_portal_grp *iscsi_portal_grp_create(int tag, bool is_private); void iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg, struct spdk_iscsi_portal *p); +struct spdk_iscsi_portal *iscsi_portal_grp_find_portal_by_addr( + struct spdk_iscsi_portal_grp *pg, const char *host, const char *port); + void iscsi_portal_grp_destroy(struct spdk_iscsi_portal_grp *pg); void iscsi_portal_grp_release(struct spdk_iscsi_portal_grp *pg); int iscsi_parse_portal_grps(void); @@ -87,4 +100,7 @@ void iscsi_portal_grps_config_text(FILE *fp); void iscsi_portal_grps_info_json(struct spdk_json_write_ctx *w); void iscsi_portal_grps_config_json(struct spdk_json_write_ctx *w); +int iscsi_parse_redirect_addr(struct sockaddr_storage *sa, + const char *host, const char *port); + #endif /* SPDK_PORTAL_GRP_H */ diff --git a/lib/iscsi/tgt_node.c b/lib/iscsi/tgt_node.c index 
f07b8d1d532..293f4190aaa 100644 --- a/lib/iscsi/tgt_node.c +++ b/lib/iscsi/tgt_node.c @@ -47,7 +47,7 @@ #include "iscsi/init_grp.h" #include "iscsi/task.h" -#define MAX_TMPBUF 1024 +#define MAX_TMPBUF 4096 #define MAX_MASKBUF 128 static bool @@ -296,17 +296,66 @@ iscsi_tgt_node_allow_iscsi_name(struct spdk_iscsi_tgt_node *target, const char * return false; } -int -iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, - const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len, - int data_len) +static int +iscsi_send_tgt_portals(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, + uint8_t *data, int alloc_len, int total) { char buf[MAX_TMPBUF]; - struct spdk_iscsi_portal_grp *pg; - struct spdk_iscsi_pg_map *pg_map; - struct spdk_iscsi_portal *p; - struct spdk_iscsi_tgt_node *target; + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_portal *p; char *host; + int len; + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + pg = pg_map->pg; + + if (pg->is_private) { + /* Skip the private portal group. Portals in the private portal group + * will be returned only by temporary login redirection responses. + */ + continue; + } + + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + if (alloc_len - total < 1) { + /* TODO: long text responses support */ + SPDK_ERRLOG("SPDK doesn't support long text responses now, " + "you can use larger MaxRecvDataSegmentLength" + "value in initiator\n"); + return alloc_len; + } + host = p->host; + /* wildcard? 
*/ + if (strcasecmp(host, "[::]") == 0 || strcasecmp(host, "0.0.0.0") == 0) { + if (spdk_sock_is_ipv6(conn->sock)) { + snprintf(buf, sizeof buf, "[%s]", conn->target_addr); + host = buf; + } else if (spdk_sock_is_ipv4(conn->sock)) { + snprintf(buf, sizeof buf, "%s", conn->target_addr); + host = buf; + } else { + /* skip portal for the family */ + continue; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "TargetAddress=%s:%s,%d\n", + host, p->port, pg->tag); + len = snprintf((char *)data + total, alloc_len - total, + "TargetAddress=%s:%s,%d", host, p->port, pg->tag); + total += len + 1; + } + } + + return total; +} + +int +iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, + const char *tiqn, uint8_t *data, int alloc_len, int data_len) +{ + struct spdk_iscsi_tgt_node *target; int total; int len; int rc; @@ -319,17 +368,12 @@ iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, if (alloc_len < 1) { return 0; } - if (total > alloc_len) { + if (total >= alloc_len) { total = alloc_len; data[total - 1] = '\0'; return total; } - if (alloc_len - total < 1) { - SPDK_ERRLOG("data space small %d\n", alloc_len); - return total; - } - pthread_mutex_lock(&g_iscsi.mutex); TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { if (strcasecmp(tiqn, "ALL") != 0 @@ -341,46 +385,13 @@ iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, continue; } - /* DO SENDTARGETS */ - len = snprintf((char *) data + total, alloc_len - total, - "TargetName=%s", target->name); + len = snprintf((char *)data + total, alloc_len - total, "TargetName=%s", + target->name); total += len + 1; - /* write to data */ - TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { - pg = pg_map->pg; - TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { - if (alloc_len - total < 1) { - pthread_mutex_unlock(&g_iscsi.mutex); - SPDK_ERRLOG("data space small %d\n", alloc_len); - return total; - } - host = p->host; - /* wildcard? 
*/ - if (strcasecmp(host, "[::]") == 0 - || strcasecmp(host, "0.0.0.0") == 0) { - if (spdk_sock_is_ipv6(conn->sock)) { - snprintf(buf, sizeof buf, "[%s]", - conn->target_addr); - host = buf; - } else if (spdk_sock_is_ipv4(conn->sock)) { - snprintf(buf, sizeof buf, "%s", - conn->target_addr); - host = buf; - } else { - /* skip portal for the family */ - continue; - } - } - SPDK_DEBUGLOG(SPDK_LOG_ISCSI, - "TargetAddress=%s:%s,%d\n", - host, p->port, pg->tag); - len = snprintf((char *) data + total, - alloc_len - total, - "TargetAddress=%s:%s,%d", - host, p->port, pg->tag); - total += len + 1; - } + total = iscsi_send_tgt_portals(conn, target, data, alloc_len, total); + if (alloc_len - total < 1) { + break; } } pthread_mutex_unlock(&g_iscsi.mutex); @@ -401,7 +412,6 @@ iscsi_find_tgt_node(const char *target_name) return target; } } - SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "can't find target %s\n", target_name); return NULL; } @@ -542,7 +552,7 @@ iscsi_tgt_node_add_pg_map(struct spdk_iscsi_tgt_node *target, return NULL; } - pg_map = malloc(sizeof(*pg_map)); + pg_map = calloc(1, sizeof(*pg_map)); if (pg_map == NULL) { return NULL; } @@ -656,14 +666,14 @@ iscsi_tgt_node_check_active_conns(void *arg) struct spdk_iscsi_tgt_node *target = arg; if (iscsi_get_active_conns(target) != 0) { - return 1; + return SPDK_POLLER_BUSY; } spdk_poller_unregister(&target->destruct_poller); spdk_scsi_dev_destruct(target->dev, _iscsi_tgt_node_destruct, target); - return 1; + return SPDK_POLLER_BUSY; } static void @@ -689,7 +699,7 @@ iscsi_tgt_node_destruct(struct spdk_iscsi_tgt_node *target, target->destruct_cb_fn = cb_fn; target->destruct_cb_arg = cb_arg; - iscsi_conns_request_logout(target); + iscsi_conns_request_logout(target, -1); if (iscsi_get_active_conns(target) != 0) { target->destruct_poller = SPDK_POLLER_REGISTER(iscsi_tgt_node_check_active_conns, @@ -846,6 +856,83 @@ iscsi_target_node_remove_pg_ig_maps(struct spdk_iscsi_tgt_node *target, return -1; } +int +iscsi_tgt_node_redirect(struct 
spdk_iscsi_tgt_node *target, int pg_tag, + const char *host, const char *port) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct sockaddr_storage sa; + + if (target == NULL) { + return -EINVAL; + } + + pg = iscsi_portal_grp_find_by_tag(pg_tag); + if (pg == NULL) { + SPDK_ERRLOG("Portal group %d is not found.\n", pg_tag); + return -EINVAL; + } + + if (pg->is_private) { + SPDK_ERRLOG("Portal group %d is not public portal group.\n", pg_tag); + return -EINVAL; + } + + pg_map = iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + SPDK_ERRLOG("Portal group %d is not mapped.\n", pg_tag); + return -EINVAL; + } + + if (host == NULL && port == NULL) { + /* Clear redirect setting. */ + memset(pg_map->redirect_host, 0, MAX_PORTAL_ADDR + 1); + memset(pg_map->redirect_port, 0, MAX_PORTAL_PORT + 1); + } else { + if (iscsi_parse_redirect_addr(&sa, host, port) != 0) { + SPDK_ERRLOG("IP address-port pair is not valid.\n"); + return -EINVAL; + } + + if (iscsi_portal_grp_find_portal_by_addr(pg, port, host) != NULL) { + SPDK_ERRLOG("IP address-port pair must be chosen from a " + "different private portal group\n"); + return -EINVAL; + } + + snprintf(pg_map->redirect_host, MAX_PORTAL_ADDR + 1, "%s", host); + snprintf(pg_map->redirect_port, MAX_PORTAL_PORT + 1, "%s", port); + } + + return 0; +} + +bool +iscsi_tgt_node_is_redirected(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, + char *buf, int buf_len) +{ + struct spdk_iscsi_pg_map *pg_map; + + if (conn == NULL || target == NULL || buf == NULL || buf_len == 0) { + return false; + } + + pg_map = iscsi_tgt_node_find_pg_map(target, conn->portal->group); + if (pg_map == NULL) { + return false; + } + + if (pg_map->redirect_host[0] == '\0' || pg_map->redirect_port[0] == '\0') { + return false; + } + + snprintf(buf, buf_len, "%s:%s", pg_map->redirect_host, pg_map->redirect_port); + + return true; +} + static int check_iscsi_name(const char *name) { diff --git 
a/lib/iscsi/tgt_node.h b/lib/iscsi/tgt_node.h index 2787fac91f2..93c25484098 100644 --- a/lib/iscsi/tgt_node.h +++ b/lib/iscsi/tgt_node.h @@ -59,6 +59,8 @@ struct spdk_iscsi_pg_map { struct spdk_iscsi_portal_grp *pg; int num_ig_maps; TAILQ_HEAD(, spdk_iscsi_ig_map) ig_map_head; + char redirect_host[MAX_PORTAL_ADDR + 1]; + char redirect_port[MAX_PORTAL_PORT + 1]; TAILQ_ENTRY(spdk_iscsi_pg_map) tailq ; }; @@ -102,8 +104,7 @@ void iscsi_shutdown_tgt_node_by_name(const char *target_name, iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg); bool iscsi_tgt_node_is_destructed(struct spdk_iscsi_tgt_node *target); int iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, - const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len, - int data_len); + const char *tiqn, uint8_t *data, int alloc_len, int data_len); /* * bdev_name_list and lun_id_list are equal sized arrays of size num_luns. @@ -127,6 +128,11 @@ int iscsi_target_node_add_pg_ig_maps(struct spdk_iscsi_tgt_node *target, int iscsi_target_node_remove_pg_ig_maps(struct spdk_iscsi_tgt_node *target, int *pg_tag_list, int *ig_tag_list, uint16_t num_maps); +int iscsi_tgt_node_redirect(struct spdk_iscsi_tgt_node *target, int pg_tag, + const char *host, const char *port); +bool iscsi_tgt_node_is_redirected(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, + char *buf, int buf_len); bool iscsi_tgt_node_access(struct spdk_iscsi_conn *conn, struct spdk_iscsi_tgt_node *target, const char *iqn, diff --git a/lib/jsonrpc/jsonrpc_server_tcp.c b/lib/jsonrpc/jsonrpc_server_tcp.c index 9effc41bbc5..1e38f713fd2 100644 --- a/lib/jsonrpc/jsonrpc_server_tcp.c +++ b/lib/jsonrpc/jsonrpc_server_tcp.c @@ -216,15 +216,21 @@ jsonrpc_server_accept(struct spdk_jsonrpc_server *server) conn->closed = false; conn->recv_len = 0; conn->outstanding_requests = 0; - pthread_spin_init(&conn->queue_lock, PTHREAD_PROCESS_PRIVATE); STAILQ_INIT(&conn->send_queue); conn->send_request = NULL; + if 
(pthread_spin_init(&conn->queue_lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("Unable to create queue lock for socket: %d", conn->sockfd); + close(conn->sockfd); + return -1; + } + flag = fcntl(conn->sockfd, F_GETFL); if (fcntl(conn->sockfd, F_SETFL, flag | O_NONBLOCK) < 0) { SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", conn->sockfd, spdk_strerror(errno)); close(conn->sockfd); + pthread_spin_destroy(&conn->queue_lock); return -1; } diff --git a/lib/log/Makefile b/lib/log/Makefile index 4e7c257585e..29115635075 100644 --- a/lib/log/Makefile +++ b/lib/log/Makefile @@ -35,7 +35,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) include $(SPDK_ROOT_DIR)/mk/spdk.common.mk SO_VER := 3 -SO_MINOR := 0 +SO_MINOR := 1 SO_SUFFIX := $(SO_VER).$(SO_MINOR) C_SRCS = log.c log_flags.c diff --git a/lib/log/log.c b/lib/log/log.c index 315e13cf93c..1d4cc3712df 100644 --- a/lib/log/log.c +++ b/lib/log/log.c @@ -46,6 +46,7 @@ static const char *const spdk_level_names[] = { #define MAX_TMPBUF 1024 static logfunc *g_log = NULL; +static bool g_log_timestamps = true; void spdk_log_open(logfunc *logf) @@ -65,6 +66,12 @@ spdk_log_close(void) } } +void +spdk_log_enable_timestamps(bool value) +{ + g_log_timestamps = value; +} + static void get_timestamp_prefix(char *buf, int buf_size) { @@ -73,9 +80,18 @@ get_timestamp_prefix(char *buf, int buf_size) struct timespec ts; long usec; + if (!g_log_timestamps) { + buf[0] = '\0'; + return; + } + clock_gettime(CLOCK_REALTIME, &ts); info = localtime(&ts.tv_sec); usec = ts.tv_nsec / 1000; + if (info == NULL) { + snprintf(buf, buf_size, "[%s.%06ld] ", "unknown date", usec); + return; + } strftime(date, sizeof(date), "%Y-%m-%d %H:%M:%S", info); snprintf(buf, buf_size, "[%s.%06ld] ", date, usec); diff --git a/lib/log/spdk_log.map b/lib/log/spdk_log.map index 84629d55562..60283d2636c 100644 --- a/lib/log/spdk_log.map +++ b/lib/log/spdk_log.map @@ -15,6 +15,7 @@ spdk_log_set_flag; spdk_log_clear_flag; spdk_log_usage; + 
spdk_log_enable_timestamps; # functions used by other SPDK libraries spdk_log_register_flag; diff --git a/lib/nbd/nbd.c b/lib/nbd/nbd.c index 28549465826..56fe037d92d 100644 --- a/lib/nbd/nbd.c +++ b/lib/nbd/nbd.c @@ -546,6 +546,7 @@ static int nbd_io_exec(struct spdk_nbd_disk *nbd) { struct nbd_io *io, *io_tmp; + int io_count = 0; int ret = 0; /* @@ -561,12 +562,14 @@ nbd_io_exec(struct spdk_nbd_disk *nbd) TAILQ_REMOVE(&nbd->received_io_list, io, tailq); ret = nbd_submit_bdev_io(nbd, io); if (ret < 0) { - break; + return ret; } + + io_count++; } } - return ret; + return io_count; } static int @@ -574,6 +577,7 @@ nbd_io_recv_internal(struct spdk_nbd_disk *nbd) { struct nbd_io *io; int ret = 0; + int received = 0; if (nbd->io_in_recv == NULL) { nbd->io_in_recv = nbd_get_io(nbd); @@ -594,6 +598,7 @@ nbd_io_recv_internal(struct spdk_nbd_disk *nbd) } io->offset += ret; + received = ret; /* request is fully received */ if (io->offset == sizeof(io->req)) { @@ -649,6 +654,7 @@ nbd_io_recv_internal(struct spdk_nbd_disk *nbd) } io->offset += ret; + received += ret; /* request payload is fully received */ if (io->offset == io->payload_size) { @@ -660,13 +666,13 @@ nbd_io_recv_internal(struct spdk_nbd_disk *nbd) } - return 0; + return received; } static int nbd_io_recv(struct spdk_nbd_disk *nbd) { - int i, ret = 0; + int i, rc, ret = 0; /* * nbd server should not accept request in both soft and hard @@ -677,13 +683,14 @@ nbd_io_recv(struct spdk_nbd_disk *nbd) } for (i = 0; i < GET_IO_LOOP_COUNT; i++) { - ret = nbd_io_recv_internal(nbd); - if (ret != 0) { - return ret; + rc = nbd_io_recv_internal(nbd); + if (rc < 0) { + return rc; } + ret += rc; } - return 0; + return ret; } static int @@ -691,6 +698,7 @@ nbd_io_xmit_internal(struct spdk_nbd_disk *nbd) { struct nbd_io *io; int ret = 0; + int sent = 0; io = TAILQ_FIRST(&nbd->executed_io_list); if (io == NULL) { @@ -713,6 +721,7 @@ nbd_io_xmit_internal(struct spdk_nbd_disk *nbd) } io->offset += ret; + sent = ret; /* response is 
fully transmitted */ if (io->offset == sizeof(io->resp)) { @@ -735,23 +744,25 @@ nbd_io_xmit_internal(struct spdk_nbd_disk *nbd) } io->offset += ret; + sent += ret; /* read payload is fully transmitted */ if (io->offset == io->payload_size) { nbd_put_io(nbd, io); - return 0; + return sent; } } reinsert: TAILQ_INSERT_HEAD(&nbd->executed_io_list, io, tailq); - return ret; + return ret < 0 ? ret : sent; } static int nbd_io_xmit(struct spdk_nbd_disk *nbd) { int ret = 0; + int rc; /* * For soft disconnection, nbd server must handle all outstanding @@ -762,10 +773,12 @@ nbd_io_xmit(struct spdk_nbd_disk *nbd) } while (!TAILQ_EMPTY(&nbd->executed_io_list)) { - ret = nbd_io_xmit_internal(nbd); - if (ret != 0) { - return ret; + rc = nbd_io_xmit_internal(nbd); + if (rc < 0) { + return rc; } + + ret += rc; } /* @@ -776,7 +789,7 @@ nbd_io_xmit(struct spdk_nbd_disk *nbd) return -1; } - return 0; + return ret; } /** @@ -787,22 +800,25 @@ nbd_io_xmit(struct spdk_nbd_disk *nbd) static int _nbd_poll(struct spdk_nbd_disk *nbd) { - int rc; + int received, sent, executed; /* transmit executed io first */ - rc = nbd_io_xmit(nbd); - if (rc < 0) { - return rc; + sent = nbd_io_xmit(nbd); + if (sent < 0) { + return sent; } - rc = nbd_io_recv(nbd); - if (rc < 0) { - return rc; + received = nbd_io_recv(nbd); + if (received < 0) { + return received; } - rc = nbd_io_exec(nbd); + executed = nbd_io_exec(nbd); + if (executed < 0) { + return executed; + } - return rc; + return sent + received + executed; } static int @@ -818,7 +834,7 @@ nbd_poll(void *arg) spdk_nbd_stop(nbd); } - return -1; + return rc > 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; } static void * @@ -942,7 +958,7 @@ nbd_enable_kernel(void *arg) NBD_BUSY_POLLING_INTERVAL_US); } /* If the kernel is busy, check back later */ - return 0; + return SPDK_POLLER_BUSY; } SPDK_ERRLOG("ioctl(NBD_SET_SOCK) failed: %s\n", spdk_strerror(errno)); @@ -957,7 +973,7 @@ nbd_enable_kernel(void *arg) } free(ctx); - return 1; + return SPDK_POLLER_BUSY; } if (ctx->poller) { @@ -966,7 +982,7 @@ nbd_enable_kernel(void *arg) nbd_start_complete(ctx); - return 1; + return SPDK_POLLER_BUSY; } void @@ -1044,7 +1060,7 @@ spdk_nbd_start(const char *bdev_name, const char *nbd_path, goto err; } - nbd->dev_fd = open(nbd_path, O_RDWR); + nbd->dev_fd = open(nbd_path, O_RDWR | O_DIRECT); if (nbd->dev_fd == -1) { SPDK_ERRLOG("open(\"%s\") failed: %s\n", nbd_path, spdk_strerror(errno)); rc = -errno; diff --git a/lib/nvme/Makefile b/lib/nvme/Makefile index 1c02965f534..9fe86c090c7 100644 --- a/lib/nvme/Makefile +++ b/lib/nvme/Makefile @@ -35,7 +35,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) include $(SPDK_ROOT_DIR)/mk/spdk.common.mk SO_VER := 4 -SO_MINOR := 0 +SO_MINOR := 1 C_SRCS = nvme_ctrlr_cmd.c nvme_ctrlr.c nvme_fabric.c nvme_ns_cmd.c nvme_ns.c nvme_pcie.c nvme_qpair.c nvme.c nvme_quirks.c nvme_transport.c nvme_uevent.c nvme_ctrlr_ocssd_cmd.c \ nvme_ns_ocssd_cmd.c nvme_tcp.c nvme_opal.c nvme_io_msg.c nvme_poll_group.c diff --git a/lib/nvme/nvme.c b/lib/nvme/nvme.c index 9393810a6b7..fb4de8d2bad 100644 --- a/lib/nvme/nvme.c +++ b/lib/nvme/nvme.c @@ -164,7 +164,7 @@ nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, * \param qpair queue to poll * \param status completion status. 
The user must fill this structure with zeroes before calling * this function - * \param timeout_in_secs optional timeout + * \param timeout_in_usecs optional timeout * * \return 0 if command completed without error, * -EIO if command completed with error, @@ -176,13 +176,13 @@ nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, int nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair, struct nvme_completion_poll_status *status, - uint64_t timeout_in_secs) + uint64_t timeout_in_usecs) { uint64_t timeout_tsc = 0; int rc = 0; - if (timeout_in_secs) { - timeout_tsc = spdk_get_ticks() + timeout_in_secs * spdk_get_ticks_hz(); + if (timeout_in_usecs) { + timeout_tsc = spdk_get_ticks() + timeout_in_usecs * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; } while (status->done == false) { @@ -895,8 +895,10 @@ spdk_nvme_trid_populate_transport(struct spdk_nvme_transport_id *trid, trstring = SPDK_NVME_TRANSPORT_NAME_TCP; break; case SPDK_NVME_TRANSPORT_CUSTOM: + trstring = SPDK_NVME_TRANSPORT_NAME_CUSTOM; + break; default: - SPDK_ERRLOG("don't use this for custom transports\n"); + SPDK_ERRLOG("no available transports\n"); assert(0); return; } diff --git a/lib/nvme/nvme_ctrlr.c b/lib/nvme/nvme_ctrlr.c index 79bd1caf676..967655d5415 100644 --- a/lib/nvme/nvme_ctrlr.c +++ b/lib/nvme/nvme_ctrlr.c @@ -206,6 +206,11 @@ spdk_nvme_ctrlr_get_default_ctrlr_opts(struct spdk_nvme_ctrlr_opts *opts, size_t if (FIELD_OK(admin_queue_size)) { opts->admin_queue_size = DEFAULT_ADMIN_QUEUE_SIZE; } + + if (FIELD_OK(fabrics_connect_timeout_us)) { + opts->fabrics_connect_timeout_us = NVME_FABRIC_CONNECT_COMMAND_TIMEOUT; + } + #undef FIELD_OK } @@ -318,7 +323,7 @@ static struct spdk_nvme_qpair * nvme_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_io_qpair_opts *opts) { - uint32_t qid; + int32_t qid; struct spdk_nvme_qpair *qpair; union spdk_nvme_cc_register cc; @@ -348,12 +353,8 @@ nvme_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, return NULL; } - /* - * Get 
the first available I/O queue ID. - */ - qid = spdk_bit_array_find_first_set(ctrlr->free_io_qids, 1); - if (qid > ctrlr->opts.num_io_queues) { - SPDK_ERRLOG("No free I/O queue IDs\n"); + qid = spdk_nvme_ctrlr_alloc_qid(ctrlr); + if (qid < 0) { nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); return NULL; } @@ -361,11 +362,11 @@ nvme_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, qpair = nvme_transport_ctrlr_create_io_qpair(ctrlr, qid, opts); if (qpair == NULL) { SPDK_ERRLOG("nvme_transport_ctrlr_create_io_qpair() failed\n"); + spdk_nvme_ctrlr_free_qid(ctrlr, qid); nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); return NULL; } - spdk_bit_array_clear(ctrlr->free_io_qids, qid); TAILQ_INSERT_TAIL(&ctrlr->active_io_qpairs, qpair, tailq); nvme_ctrlr_proc_add_io_qpair(qpair); @@ -452,6 +453,7 @@ spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *ctrlr, rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair); if (rc != 0) { SPDK_ERRLOG("nvme_transport_ctrlr_connect_io_qpair() failed\n"); + TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq); nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair); return NULL; } @@ -463,6 +465,7 @@ int spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair) { struct spdk_nvme_ctrlr *ctrlr; + enum nvme_qpair_state qpair_state; int rc; assert(qpair != NULL); @@ -471,23 +474,24 @@ spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair) ctrlr = qpair->ctrlr; nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + qpair_state = nvme_qpair_get_state(qpair); if (ctrlr->is_removed) { rc = -ENODEV; goto out; } - if (ctrlr->is_resetting) { + if (ctrlr->is_resetting || qpair_state == NVME_QPAIR_DISCONNECTING) { rc = -EAGAIN; goto out; } - if (ctrlr->is_failed) { + if (ctrlr->is_failed || qpair_state == NVME_QPAIR_DESTROYING) { rc = -ENXIO; goto out; } - if (nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTED) { + if (qpair_state != NVME_QPAIR_DISCONNECTED) { rc = 0; goto out; } @@ -561,13 +565,22 @@ spdk_nvme_ctrlr_free_io_qpair(struct 
spdk_nvme_qpair *qpair) /* Do not retry. */ nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); - nvme_qpair_abort_reqs(qpair, 1); + + /* In the multi-process case, a process may call this function on a foreign + * I/O qpair (i.e. one that this process did not create) when that qpairs process + * exits unexpectedly. In that case, we must not try to abort any reqs associated + * with that qpair, since the callbacks will also be foreign to this process. + */ + if (qpair->active_proc == nvme_ctrlr_get_current_process(ctrlr)) { + nvme_qpair_abort_reqs(qpair, 1); + } + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); nvme_ctrlr_proc_remove_io_qpair(qpair); TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq); - spdk_bit_array_set(ctrlr->free_io_qids, qpair->id); + spdk_nvme_ctrlr_free_qid(ctrlr, qpair->id); if (nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair)) { nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); @@ -642,7 +655,7 @@ static int nvme_ctrlr_set_intel_support_log_pages(struct spdk_nvme_ctrlr *ctrlr) } if (nvme_wait_for_completion_timeout(ctrlr->adminq, status, - ctrlr->opts.admin_timeout_ms / 1000)) { + ctrlr->opts.admin_timeout_ms * 1000)) { spdk_free(log_page_directory); SPDK_WARNLOG("Intel log pages not supported on Intel drive!\n"); if (!status->timed_out) { @@ -673,6 +686,9 @@ nvme_ctrlr_set_supported_log_pages(struct spdk_nvme_ctrlr *ctrlr) if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL && !(ctrlr->quirks & NVME_INTEL_QUIRK_NO_LOG_PAGES)) { rc = nvme_ctrlr_set_intel_support_log_pages(ctrlr); } + if (ctrlr->cdata.cmic.ana_reporting) { + ctrlr->log_page_supported[SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS] = true; + } return rc; } @@ -727,7 +743,7 @@ nvme_ctrlr_set_arbitration_feature(struct spdk_nvme_ctrlr *ctrlr) } if (nvme_wait_for_completion_timeout(ctrlr->adminq, status, - ctrlr->opts.admin_timeout_ms / 1000)) { + ctrlr->opts.admin_timeout_ms * 1000)) { SPDK_ERRLOG("Timeout to set arbitration feature\n"); } @@ -901,7 +917,8 @@ nvme_ctrlr_enable(struct 
spdk_nvme_ctrlr *ctrlr) if (!(ctrlr->cap.bits.css & (1u << ctrlr->opts.command_set))) { SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested I/O command set %u but supported mask is 0x%x\n", ctrlr->opts.command_set, ctrlr->cap.bits.css); - return -EINVAL; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Falling back to NVM. Assuming NVM is supported.\n"); + ctrlr->opts.command_set = SPDK_NVME_CC_CSS_NVM; } cc.bits.css = ctrlr->opts.command_set; @@ -1053,7 +1070,7 @@ nvme_ctrlr_set_state(struct spdk_nvme_ctrlr *ctrlr, enum nvme_ctrlr_state state, ctrlr->state_timeout_tsc = timeout_in_ticks + now_ticks; SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (timeout %" PRIu64 " ms)\n", - nvme_ctrlr_state_string(ctrlr->state), ctrlr->state_timeout_tsc); + nvme_ctrlr_state_string(ctrlr->state), timeout_in_ms); return; inf: SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (no timeout)\n", @@ -1156,12 +1173,28 @@ nvme_ctrlr_set_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr) return rc; } +static void +nvme_ctrlr_abort_queued_aborts(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_request *req, *tmp; + struct spdk_nvme_cpl cpl = {}; + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + STAILQ_FOREACH_SAFE(req, &ctrlr->queued_aborts, stailq, tmp) { + STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); + + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, &cpl); + nvme_free_request(req); + } +} + int spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) { - int rc = 0; + int rc = 0, rc_tmp = 0; struct spdk_nvme_qpair *qpair; - struct nvme_request *req, *tmp; nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); @@ -1180,12 +1213,8 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) SPDK_NOTICELOG("resetting controller\n"); - /* Free all of the queued abort requests */ - STAILQ_FOREACH_SAFE(req, &ctrlr->queued_aborts, stailq, tmp) { - STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); - nvme_free_request(req); - ctrlr->outstanding_aborts--; - } + /* 
Abort all of the queued abort requests */ + nvme_ctrlr_abort_queued_aborts(ctrlr); nvme_transport_admin_qpair_abort_aers(ctrlr->adminq); @@ -1196,9 +1225,9 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) ctrlr->adminq->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq); - if (nvme_transport_ctrlr_connect_qpair(ctrlr, ctrlr->adminq) != 0) { + rc = nvme_transport_ctrlr_connect_qpair(ctrlr, ctrlr->adminq); + if (rc != 0) { SPDK_ERRLOG("Controller reinitialization failed.\n"); - rc = -1; goto out; } @@ -1227,9 +1256,10 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) if (rc == 0 && ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { /* Reinitialize qpairs */ TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { - if (nvme_transport_ctrlr_connect_qpair(ctrlr, qpair) != 0) { + rc_tmp = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair); + if (rc_tmp != 0) { + rc = rc_tmp; qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; - rc = -1; continue; } } @@ -1343,7 +1373,7 @@ nvme_ctrlr_identify_done(void *arg, const struct spdk_nvme_cpl *cpl) SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_sges %u\n", ctrlr->max_sges); } - if (ctrlr->cdata.oacs.security) { + if (ctrlr->cdata.oacs.security && !(ctrlr->quirks & NVME_QUIRK_OACS_SECURITY)) { ctrlr->flags |= SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED; } @@ -1700,7 +1730,8 @@ nvme_ctrlr_identify_id_desc_namespaces(struct spdk_nvme_ctrlr *ctrlr) struct spdk_nvme_ns *ns; int rc; - if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + if ((ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) && + !(ctrlr->cap.bits.css & SPDK_NVME_CAP_CSS_IOCS)) || (ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, @@ -1776,11 +1807,11 @@ nvme_ctrlr_set_num_queues_done(void *arg, const struct spdk_nvme_cpl *cpl) return; } - 
/* Initialize list of free I/O queue IDs. QID 0 is the admin queue. */ - spdk_bit_array_clear(ctrlr->free_io_qids, 0); + /* Initialize list of free I/O queue IDs. QID 0 is the admin queue (implicitly allocated). */ for (i = 1; i <= ctrlr->opts.num_io_queues; i++) { - spdk_bit_array_set(ctrlr->free_io_qids, i); + spdk_nvme_ctrlr_free_qid(ctrlr, i); } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONSTRUCT_NS, ctrlr->opts.admin_timeout_ms); } @@ -1815,7 +1846,7 @@ nvme_ctrlr_set_num_queues(struct spdk_nvme_ctrlr *ctrlr) static void nvme_ctrlr_set_keep_alive_timeout_done(void *arg, const struct spdk_nvme_cpl *cpl) { - uint32_t keep_alive_interval_ms; + uint32_t keep_alive_interval_us; struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; if (spdk_nvme_cpl_is_error(cpl)) { @@ -1838,16 +1869,20 @@ nvme_ctrlr_set_keep_alive_timeout_done(void *arg, const struct spdk_nvme_cpl *cp ctrlr->opts.keep_alive_timeout_ms = cpl->cdw0; } - keep_alive_interval_ms = ctrlr->opts.keep_alive_timeout_ms / 2; - if (keep_alive_interval_ms == 0) { - keep_alive_interval_ms = 1; - } - SPDK_DEBUGLOG(SPDK_LOG_NVME, "Sending keep alive every %u ms\n", keep_alive_interval_ms); + if (ctrlr->opts.keep_alive_timeout_ms == 0) { + ctrlr->keep_alive_interval_ticks = 0; + } else { + keep_alive_interval_us = ctrlr->opts.keep_alive_timeout_ms * 1000 / 2; - ctrlr->keep_alive_interval_ticks = (keep_alive_interval_ms * spdk_get_ticks_hz()) / UINT64_C(1000); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Sending keep alive every %u us\n", keep_alive_interval_us); + + ctrlr->keep_alive_interval_ticks = (keep_alive_interval_us * spdk_get_ticks_hz()) / + UINT64_C(1000000); + + /* Schedule the first Keep Alive to be sent as soon as possible. */ + ctrlr->next_keep_alive_tick = spdk_get_ticks(); + } - /* Schedule the first Keep Alive to be sent as soon as possible. 
*/ - ctrlr->next_keep_alive_tick = spdk_get_ticks(); nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, ctrlr->opts.admin_timeout_ms); } @@ -2189,6 +2224,9 @@ nvme_ctrlr_configure_aer(struct spdk_nvme_ctrlr *ctrlr) if (ctrlr->cdata.oaes.fw_activation_notices) { config.bits.fw_activation_notice = 1; } + if (ctrlr->cdata.oaes.ana_change_notices) { + config.bits.ana_change_notice = 1; + } } if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 3, 0) && ctrlr->cdata.lpa.telemetry) { config.bits.telemetry_log_notice = 1; @@ -2484,11 +2522,11 @@ nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr) if (nvme_ctrlr_get_cc(ctrlr, &cc) || nvme_ctrlr_get_csts(ctrlr, &csts)) { - if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE) { + if (!ctrlr->is_failed && ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE) { /* While a device is resetting, it may be unable to service MMIO reads * temporarily. Allow for this case. */ - SPDK_ERRLOG("Get registers failed while waiting for CSTS.RDY == 0\n"); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Get registers failed while waiting for CSTS.RDY == 0\n"); goto init_timeout; } SPDK_ERRLOG("Failed to read CC and CSTS in state %d\n", ctrlr->state); @@ -2832,6 +2870,8 @@ nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) ctrlr->is_destructed = true; spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + + nvme_ctrlr_abort_queued_aborts(ctrlr); nvme_transport_admin_qpair_abort_aers(ctrlr->adminq); TAILQ_FOREACH_SAFE(qpair, &ctrlr->active_io_qpairs, tailq, tmp) { @@ -2872,22 +2912,22 @@ nvme_keep_alive_completion(void *cb_ctx, const struct spdk_nvme_cpl *cpl) * Check if we need to send a Keep Alive command. * Caller must hold ctrlr->ctrlr_lock. 
*/ -static void +static int nvme_ctrlr_keep_alive(struct spdk_nvme_ctrlr *ctrlr) { uint64_t now; struct nvme_request *req; struct spdk_nvme_cmd *cmd; - int rc; + int rc = 0; now = spdk_get_ticks(); if (now < ctrlr->next_keep_alive_tick) { - return; + return rc; } req = nvme_allocate_request_null(ctrlr->adminq, nvme_keep_alive_completion, NULL); if (req == NULL) { - return; + return rc; } cmd = &req->cmd; @@ -2896,9 +2936,11 @@ nvme_ctrlr_keep_alive(struct spdk_nvme_ctrlr *ctrlr) rc = nvme_ctrlr_submit_admin_request(ctrlr, req); if (rc != 0) { SPDK_ERRLOG("Submitting Keep Alive failed\n"); + rc = -ENXIO; } ctrlr->next_keep_alive_tick = now + ctrlr->keep_alive_interval_ticks; + return rc; } int32_t @@ -2910,7 +2952,11 @@ spdk_nvme_ctrlr_process_admin_completions(struct spdk_nvme_ctrlr *ctrlr) nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); if (ctrlr->keep_alive_interval_ticks) { - nvme_ctrlr_keep_alive(ctrlr); + rc = nvme_ctrlr_keep_alive(ctrlr); + if (rc) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; + } } rc = nvme_io_msg_process(ctrlr); @@ -3532,6 +3578,34 @@ spdk_nvme_ctrlr_get_transport_id(struct spdk_nvme_ctrlr *ctrlr) return &ctrlr->trid; } +int32_t +spdk_nvme_ctrlr_alloc_qid(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t qid; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + qid = spdk_bit_array_find_first_set(ctrlr->free_io_qids, 1); + if (qid > ctrlr->opts.num_io_queues) { + SPDK_ERRLOG("No free I/O queue IDs\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -1; + } + + spdk_bit_array_clear(ctrlr->free_io_qids, qid); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return qid; +} + +void +spdk_nvme_ctrlr_free_qid(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid) +{ + assert(qid <= ctrlr->opts.num_io_queues); + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + spdk_bit_array_set(ctrlr->free_io_qids, qid); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + /* FIXME need to specify max number of iovs */ int spdk_nvme_map_prps(void *prv, 
struct spdk_nvme_cmd *cmd, struct iovec *iovs, diff --git a/lib/nvme/nvme_ctrlr_cmd.c b/lib/nvme/nvme_ctrlr_cmd.c index 37d878b29e4..9b16c8d6fe6 100644 --- a/lib/nvme/nvme_ctrlr_cmd.c +++ b/lib/nvme/nvme_ctrlr_cmd.c @@ -565,16 +565,15 @@ spdk_nvme_ctrlr_cmd_get_log_page(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page } static void -nvme_ctrlr_cmd_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) +nvme_ctrlr_retry_queued_abort(struct spdk_nvme_ctrlr *ctrlr) { - struct nvme_request *req, *next, *tmp; - struct spdk_nvme_ctrlr *ctrlr; - int rc; + struct nvme_request *next, *tmp; + int rc; - req = ctx; - ctrlr = (struct spdk_nvme_ctrlr *)req->user_buffer; + if (ctrlr->is_resetting || ctrlr->is_destructed) { + return; + } - ctrlr->outstanding_aborts--; STAILQ_FOREACH_SAFE(next, &ctrlr->queued_aborts, stailq, tmp) { STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); ctrlr->outstanding_aborts++; @@ -585,13 +584,39 @@ nvme_ctrlr_cmd_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) next->cpl.status.sct = SPDK_NVME_SCT_GENERIC; next->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; next->cpl.status.dnr = 1; - nvme_complete_request(next->cb_fn, next->cb_arg, next->qpair, next, &req->cpl); + nvme_complete_request(next->cb_fn, next->cb_arg, next->qpair, next, &next->cpl); nvme_free_request(next); } else { /* If the first abort succeeds, stop iterating. */ break; } } +} + +static int +_nvme_ctrlr_submit_abort_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req) +{ + /* ACL is a 0's based value. 
*/ + if (ctrlr->outstanding_aborts >= ctrlr->cdata.acl + 1U) { + STAILQ_INSERT_TAIL(&ctrlr->queued_aborts, req, stailq); + return 0; + } else { + ctrlr->outstanding_aborts++; + return nvme_ctrlr_submit_admin_request(ctrlr, req); + } +} + +static void +nvme_ctrlr_cmd_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = ctx; + struct spdk_nvme_ctrlr *ctrlr; + + ctrlr = req->qpair->ctrlr; + + ctrlr->outstanding_aborts--; + nvme_ctrlr_retry_queued_abort(ctrlr); req->user_cb_fn(req->user_cb_arg, cpl); } @@ -603,12 +628,9 @@ spdk_nvme_ctrlr_cmd_abort(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair int rc; struct nvme_request *req; struct spdk_nvme_cmd *cmd; - uint16_t sqid; - if (qpair) { - sqid = qpair->id; - } else { - sqid = ctrlr->adminq->id; /* 0 */ + if (qpair == NULL) { + qpair = ctrlr->adminq; } nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); @@ -620,23 +642,182 @@ spdk_nvme_ctrlr_cmd_abort(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair req->cb_arg = req; req->user_cb_fn = cb_fn; req->user_cb_arg = cb_arg; - req->user_buffer = ctrlr; /* This is a hack to get to the ctrlr in the - * completion handler. 
*/ cmd = &req->cmd; cmd->opc = SPDK_NVME_OPC_ABORT; - cmd->cdw10_bits.abort.sqid = sqid; + cmd->cdw10_bits.abort.sqid = qpair->id; cmd->cdw10_bits.abort.cid = cid; - if (ctrlr->outstanding_aborts >= ctrlr->cdata.acl) { - STAILQ_INSERT_TAIL(&ctrlr->queued_aborts, req, stailq); - rc = 0; + rc = _nvme_ctrlr_submit_abort_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +static void +nvme_complete_abort_request(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = ctx; + struct nvme_request *parent = req->parent; + struct spdk_nvme_ctrlr *ctrlr; + + ctrlr = req->qpair->ctrlr; + + ctrlr->outstanding_aborts--; + nvme_ctrlr_retry_queued_abort(ctrlr); + + nvme_request_remove_child(parent, req); + + if (!spdk_nvme_cpl_is_abort_success(cpl)) { + parent->parent_status.cdw0 |= 1U; + } + + if (parent->num_children == 0) { + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } +} + +static int +nvme_request_add_abort(struct nvme_request *req, void *arg) +{ + struct nvme_request *parent = arg; + struct nvme_request *child; + void *cmd_cb_arg; + + cmd_cb_arg = parent->user_cb_arg; + + if (req->cb_arg != cmd_cb_arg && + (req->parent == NULL || req->parent->cb_arg != cmd_cb_arg)) { + return 0; + } + + child = nvme_allocate_request_null(parent->qpair->ctrlr->adminq, + nvme_complete_abort_request, NULL); + if (child == NULL) { + return -ENOMEM; + } + + child->cb_arg = child; + + child->cmd.opc = SPDK_NVME_OPC_ABORT; + /* Copy SQID from the parent. 
*/ + child->cmd.cdw10_bits.abort.sqid = parent->cmd.cdw10_bits.abort.sqid; + child->cmd.cdw10_bits.abort.cid = req->cmd.cid; + + child->parent = parent; + + TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); + parent->num_children++; + + return 0; +} + +int +spdk_nvme_ctrlr_cmd_abort_ext(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + void *cmd_cb_arg, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + int rc = 0; + struct nvme_request *parent, *child, *tmp; + bool child_failed = false; + int aborted = 0; + + if (cmd_cb_arg == NULL) { + return -EINVAL; + } + + pthread_mutex_lock(&ctrlr->ctrlr_lock); + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + parent = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (parent == NULL) { + pthread_mutex_unlock(&ctrlr->ctrlr_lock); + + return -ENOMEM; + } + + TAILQ_INIT(&parent->children); + parent->num_children = 0; + + parent->cmd.opc = SPDK_NVME_OPC_ABORT; + memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl)); + + /* Hold SQID that the requests to abort are associated with. + * This will be copied to the children. + * + * CID is not set here because the parent is not submitted directly + * and CID is not determined until request to abort is found. + */ + parent->cmd.cdw10_bits.abort.sqid = qpair->id; + + /* This is used to find request to abort. */ + parent->user_cb_arg = cmd_cb_arg; + + /* Add an abort request for each outstanding request which has cmd_cb_arg + * as its callback context. + */ + rc = nvme_transport_qpair_iterate_requests(qpair, nvme_request_add_abort, parent); + if (rc != 0) { + /* Free abort requests already added. */ + child_failed = true; + } + + TAILQ_FOREACH_SAFE(child, &parent->children, child_tailq, tmp) { + if (spdk_likely(!child_failed)) { + rc = _nvme_ctrlr_submit_abort_request(ctrlr, child); + if (spdk_unlikely(rc != 0)) { + child_failed = true; + } + } else { + /* Free remaining abort requests. 
*/ + nvme_request_remove_child(parent, child); + nvme_free_request(child); + } + } + + if (spdk_likely(!child_failed)) { + /* There is no error so far. Abort requests were submitted successfully + * or there was no outstanding request to abort. + * + * Hence abort queued requests which has cmd_cb_arg as its callback + * context next. + */ + aborted = nvme_qpair_abort_queued_reqs(qpair, cmd_cb_arg); + if (parent->num_children == 0) { + /* There was no outstanding request to abort. */ + if (aborted > 0) { + /* The queued requests were successfully aborted. Hence + * complete the parent request with success synchronously. + */ + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } else { + /* There was no queued request to abort. */ + rc = -ENOENT; + } + } } else { - ctrlr->outstanding_aborts++; - rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + /* Failed to add or submit abort request. */ + if (parent->num_children != 0) { + /* Return success since we must wait for those children + * to complete but set the parent request to failure. 
+ */ + parent->parent_status.cdw0 |= 1U; + rc = 0; + } } - nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + if (rc != 0) { + nvme_free_request(parent); + } + + pthread_mutex_unlock(&ctrlr->ctrlr_lock); return rc; } diff --git a/lib/nvme/nvme_cuse.c b/lib/nvme/nvme_cuse.c index 9a5ee1f0d9e..0dc4effb3a7 100644 --- a/lib/nvme/nvme_cuse.c +++ b/lib/nvme/nvme_cuse.c @@ -102,7 +102,8 @@ cuse_nvme_admin_cmd_cb(void *arg, const struct spdk_nvme_cpl *cpl) struct iovec out_iov[2]; struct spdk_nvme_cpl _cpl; - if (ctx->data_transfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + if (ctx->data_transfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER || + ctx->data_transfer == SPDK_NVME_DATA_NONE) { fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, NULL, 0); } else { memcpy(&_cpl, cpl, sizeof(struct spdk_nvme_cpl)); @@ -206,10 +207,6 @@ cuse_nvme_admin_cmd(fuse_req_t req, int cmd, void *arg, admin_cmd = (struct nvme_admin_cmd *)in_buf; switch (spdk_nvme_opc_get_data_transfer(admin_cmd->opcode)) { - case SPDK_NVME_DATA_NONE: - SPDK_ERRLOG("SPDK_NVME_DATA_NONE not implemented\n"); - fuse_reply_err(req, EINVAL); - return; case SPDK_NVME_DATA_HOST_TO_CONTROLLER: if (admin_cmd->addr != 0) { in_iov[1].iov_base = (void *)admin_cmd->addr; @@ -223,6 +220,7 @@ cuse_nvme_admin_cmd(fuse_req_t req, int cmd, void *arg, cuse_nvme_admin_cmd_send(req, admin_cmd, NULL); } return; + case SPDK_NVME_DATA_NONE: case SPDK_NVME_DATA_CONTROLLER_TO_HOST: if (out_bufsz == 0) { out_iov[0].iov_base = &((struct nvme_admin_cmd *)arg)->result; @@ -317,16 +315,13 @@ cuse_nvme_submit_io_write_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void } static void -cuse_nvme_submit_io_write(fuse_req_t req, int cmd, void *arg, - struct fuse_file_info *fi, unsigned flags, +cuse_nvme_submit_io_write(struct cuse_device *cuse_device, fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, uint32_t block_size, const void *in_buf, size_t in_bufsz, size_t out_bufsz) { const struct nvme_user_io *user_io = in_buf; struct 
cuse_io_ctx *ctx; - struct spdk_nvme_ns *ns; - uint32_t block_size; int rc; - struct cuse_device *cuse_device = fuse_req_userdata(req); ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); if (!ctx) { @@ -336,10 +331,6 @@ cuse_nvme_submit_io_write(fuse_req_t req, int cmd, void *arg, } ctx->req = req; - - ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); - block_size = spdk_nvme_ns_get_sector_size(ns); - ctx->lba = user_io->slba; ctx->lba_count = user_io->nblocks + 1; ctx->data_len = ctx->lba_count * block_size; @@ -399,16 +390,13 @@ cuse_nvme_submit_io_read_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void * } static void -cuse_nvme_submit_io_read(fuse_req_t req, int cmd, void *arg, - struct fuse_file_info *fi, unsigned flags, +cuse_nvme_submit_io_read(struct cuse_device *cuse_device, fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, uint32_t block_size, const void *in_buf, size_t in_bufsz, size_t out_bufsz) { int rc; struct cuse_io_ctx *ctx; const struct nvme_user_io *user_io = in_buf; - struct cuse_device *cuse_device = fuse_req_userdata(req); - struct spdk_nvme_ns *ns; - uint32_t block_size; ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); if (!ctx) { @@ -419,10 +407,7 @@ cuse_nvme_submit_io_read(fuse_req_t req, int cmd, void *arg, ctx->req = req; ctx->lba = user_io->slba; - ctx->lba_count = user_io->nblocks; - - ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); - block_size = spdk_nvme_ns_get_sector_size(ns); + ctx->lba_count = user_io->nblocks + 1; ctx->data_len = ctx->lba_count * block_size; ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, @@ -450,6 +435,9 @@ cuse_nvme_submit_io(fuse_req_t req, int cmd, void *arg, { const struct nvme_user_io *user_io; struct iovec in_iov[2], out_iov; + struct cuse_device *cuse_device = fuse_req_userdata(req); + struct spdk_nvme_ns *ns; + uint32_t block_size; in_iov[0].iov_base = (void *)arg; 
in_iov[0].iov_len = sizeof(*user_io); @@ -460,29 +448,31 @@ cuse_nvme_submit_io(fuse_req_t req, int cmd, void *arg, user_io = in_buf; + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + block_size = spdk_nvme_ns_get_sector_size(ns); + switch (user_io->opcode) { case SPDK_NVME_OPC_READ: out_iov.iov_base = (void *)user_io->addr; - out_iov.iov_len = (user_io->nblocks + 1) * 512; + out_iov.iov_len = (user_io->nblocks + 1) * block_size; if (out_bufsz == 0) { fuse_reply_ioctl_retry(req, in_iov, 1, &out_iov, 1); return; } - cuse_nvme_submit_io_read(req, cmd, arg, fi, flags, in_buf, - in_bufsz, out_bufsz); + cuse_nvme_submit_io_read(cuse_device, req, cmd, arg, fi, flags, + block_size, in_buf, in_bufsz, out_bufsz); break; case SPDK_NVME_OPC_WRITE: in_iov[1].iov_base = (void *)user_io->addr; - in_iov[1].iov_len = (user_io->nblocks + 1) * 512; + in_iov[1].iov_len = (user_io->nblocks + 1) * block_size; if (in_bufsz == sizeof(*user_io)) { fuse_reply_ioctl_retry(req, in_iov, 2, NULL, 0); return; } - cuse_nvme_submit_io_write(req, cmd, arg, fi, flags, in_buf, - in_bufsz, out_bufsz); - + cuse_nvme_submit_io_write(cuse_device, req, cmd, arg, fi, flags, + block_size, in_buf, in_bufsz, out_bufsz); break; default: SPDK_ERRLOG("SUBMIT_IO: opc:%d not valid\n", user_io->opcode); diff --git a/lib/nvme/nvme_fabric.c b/lib/nvme/nvme_fabric.c index 9fff20873dd..c3d4f88028c 100644 --- a/lib/nvme/nvme_fabric.c +++ b/lib/nvme/nvme_fabric.c @@ -454,7 +454,8 @@ nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries) return rc; } - if (nvme_wait_for_completion(qpair, status)) { + /* If we time out, the qpair will abort the request upon destruction. 
*/ + if (nvme_wait_for_completion_timeout(qpair, status, ctrlr->opts.fabrics_connect_timeout_us)) { SPDK_ERRLOG("Connect command failed\n"); spdk_free(nvmf_data); if (!status->timed_out) { diff --git a/lib/nvme/nvme_internal.h b/lib/nvme/nvme_internal.h index f4c0f988921..71d5df69bbf 100644 --- a/lib/nvme/nvme_internal.h +++ b/lib/nvme/nvme_internal.h @@ -145,6 +145,11 @@ extern pid_t g_spdk_nvme_pid; */ #define NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH 0x1000 +/** + * The SSD does not support OPAL even through it sets the security bit in OACS. + */ +#define NVME_QUIRK_OACS_SECURITY 0x2000 + #define NVME_MAX_ASYNC_EVENTS (8) #define NVME_MAX_ADMIN_TIMEOUT_IN_SECS (30) @@ -178,6 +183,14 @@ extern pid_t g_spdk_nvme_pid; sizeof(struct spdk_nvme_cmd), \ sizeof(struct spdk_nvme_cpl))) +/* Default timeout for fabrics connect commands. */ +#ifdef DEBUG +#define NVME_FABRIC_CONNECT_COMMAND_TIMEOUT 0 +#else +/* 500 millisecond timeout. */ +#define NVME_FABRIC_CONNECT_COMMAND_TIMEOUT 500000 +#endif + enum nvme_payload_type { NVME_PAYLOAD_TYPE_INVALID = 0, @@ -401,6 +414,7 @@ struct spdk_nvme_qpair { STAILQ_HEAD(, nvme_request) free_req; STAILQ_HEAD(, nvme_request) queued_req; + STAILQ_HEAD(, nvme_request) aborting_queued_req; /* List entry for spdk_nvme_transport_poll_group::qpairs */ STAILQ_ENTRY(spdk_nvme_qpair) poll_group_stailq; @@ -890,7 +904,7 @@ int nvme_wait_for_completion_robust_lock(struct spdk_nvme_qpair *qpair, pthread_mutex_t *robust_mutex); int nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair, struct nvme_completion_poll_status *status, - uint64_t timeout_in_secs); + uint64_t timeout_in_usecs); struct spdk_nvme_ctrlr_process *nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, pid_t pid); @@ -928,6 +942,7 @@ void nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair); int nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req); void nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); +uint32_t 
nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg); void nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests); int nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr); @@ -1192,6 +1207,10 @@ int nvme_transport_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nv int32_t nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions); void nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair); +int nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg); + struct spdk_nvme_transport_poll_group *nvme_transport_poll_group_create( const struct spdk_nvme_transport *transport); int nvme_transport_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, diff --git a/lib/nvme/nvme_ns.c b/lib/nvme/nvme_ns.c index 5d424e5c73c..09ae81c4c75 100644 --- a/lib/nvme/nvme_ns.c +++ b/lib/nvme/nvme_ns.c @@ -157,7 +157,8 @@ nvme_ctrlr_identify_id_desc(struct spdk_nvme_ns *ns) memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); - if (ns->ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + if ((ns->ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) && + !(ns->ctrlr->cap.bits.css & SPDK_NVME_CAP_CSS_IOCS)) || (ns->ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); return 0; @@ -351,13 +352,38 @@ spdk_nvme_ns_get_uuid(const struct spdk_nvme_ns *ns) size_t uuid_size; uuid = nvme_ns_find_id_desc(ns, SPDK_NVME_NIDT_UUID, &uuid_size); - if (uuid == NULL || uuid_size != sizeof(*uuid)) { + if (uuid && uuid_size != sizeof(*uuid)) { + SPDK_WARNLOG("Invalid NIDT_UUID descriptor length reported: %zu (expected: %zu)\n", + uuid_size, sizeof(*uuid)); return NULL; } return uuid; } +enum spdk_nvme_csi +spdk_nvme_ns_get_csi(const struct spdk_nvme_ns *ns) { + const uint8_t *csi; + size_t csi_size; + + csi = 
nvme_ns_find_id_desc(ns, SPDK_NVME_NIDT_CSI, &csi_size); + if (csi && csi_size != sizeof(*csi)) + { + SPDK_WARNLOG("Invalid NIDT_CSI descriptor length reported: %zu (expected: %zu)\n", + csi_size, sizeof(*csi)); + return SPDK_NVME_CSI_NVM; + } + if (!csi) + { + if (ns->ctrlr->cap.bits.css & SPDK_NVME_CAP_CSS_IOCS) { + SPDK_WARNLOG("CSI not reported for NSID: %" PRIu32 "\n", ns->id); + } + return SPDK_NVME_CSI_NVM; + } + + return *csi; +} + int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, struct spdk_nvme_ctrlr *ctrlr) { @@ -373,6 +399,11 @@ int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, return rc; } + /* skip Identify NS ID Descriptor List for inactive NS */ + if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, id)) { + return 0; + } + return nvme_ctrlr_identify_id_desc(ns); } diff --git a/lib/nvme/nvme_pcie.c b/lib/nvme/nvme_pcie.c index 5c710efadee..ef00f804ab4 100644 --- a/lib/nvme/nvme_pcie.c +++ b/lib/nvme/nvme_pcie.c @@ -269,8 +269,6 @@ _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) struct spdk_nvme_ctrlr *ctrlr, *tmp; struct spdk_uevent event; struct spdk_pci_addr pci_addr; - union spdk_nvme_csts_register csts; - struct spdk_nvme_ctrlr_process *proc; if (g_spdk_nvme_driver->hotplug_fd < 0) { return 0; @@ -313,25 +311,20 @@ _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) } } - /* This is a work around for vfio-attached device hot remove detection. */ + /* Initiate removal of physically hotremoved PCI controllers. Even after + * they're hotremoved from the system, SPDK might still report them via RPC. 
+ */ TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) { bool do_remove = false; + struct nvme_pcie_ctrlr *pctrlr; - if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { - struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); - - if (spdk_pci_device_is_removed(pctrlr->devhandle)) { - do_remove = true; - } + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + continue; } - /* NVMe controller BAR must be mapped in the current process before any access. */ - proc = nvme_ctrlr_get_current_process(ctrlr); - if (proc) { - csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); - if (csts.raw == 0xffffffffU) { - do_remove = true; - } + pctrlr = nvme_pcie_ctrlr(ctrlr); + if (spdk_pci_device_is_removed(pctrlr->devhandle)) { + do_remove = true; } if (do_remove) { @@ -620,8 +613,6 @@ nvme_pcie_ctrlr_map_io_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size) VALUE_2MB - 1); mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset + pctrlr->cmb.size); - pctrlr->cmb.mem_register_addr = (void *)mem_register_start; - pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start; rc = spdk_mem_register((void *)mem_register_start, mem_register_end - mem_register_start); if (rc) { @@ -1153,6 +1144,22 @@ nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair, return 0; } +/* Used when dst points to MMIO (i.e. CMB) in a virtual machine - in these cases we must + * not use wide instructions because QEMU will not emulate such instructions to MMIO space. + * So this function ensures we only copy 8 bytes at a time. 
+ */ +static inline void +nvme_pcie_copy_command_mmio(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) +{ + uint64_t *dst64 = (uint64_t *)dst; + const uint64_t *src64 = (const uint64_t *)src; + uint32_t i; + + for (i = 0; i < sizeof(*dst) / 8; i++) { + dst64[i] = src64[i]; + } +} + static inline void nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) { @@ -1336,7 +1343,7 @@ nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracke * virtual NVMe controller, the maximum access width is 8 Bytes for one time. */ if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) { - pqpair->cmd[pqpair->sq_tail] = req->cmd; + nvme_pcie_copy_command_mmio(&pqpair->cmd[pqpair->sq_tail], &req->cmd); } else { /* Copy the command from the tracker to the submission queue. */ nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd); @@ -1383,6 +1390,8 @@ nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_trac req->retries++; nvme_pcie_qpair_submit_tracker(qpair, tr); } else { + TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list); + /* Only check admin requests from different processes. 
*/ if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) { req_from_current_proc = false; @@ -1397,7 +1406,6 @@ nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_trac tr->req = NULL; - TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list); TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); } } @@ -1440,6 +1448,29 @@ nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr) } } +static int +nvme_pcie_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) { + assert(tr->req != NULL); + + rc = iter_fn(tr->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + static void nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) { @@ -1729,6 +1760,9 @@ nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme { } +static int32_t nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, + uint32_t max_completions); + static int nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) { @@ -1761,6 +1795,11 @@ nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ return -1; } + /* Now that the submission queue is deleted, the device is supposed to have + * completed any outstanding I/O. Try to complete them. If they don't complete, + * they'll be marked as aborted and completed below. 
*/ + nvme_pcie_qpair_process_completions(qpair, 0); + memset(status, 0, sizeof(*status)); /* Delete the completion queue */ rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); @@ -2513,6 +2552,20 @@ nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) return 0; } +static struct spdk_pci_id nvme_pci_driver_id[] = { + { + .class_id = SPDK_PCI_CLASS_NVME, + .vendor_id = SPDK_PCI_ANY_ID, + .device_id = SPDK_PCI_ANY_ID, + .subvendor_id = SPDK_PCI_ANY_ID, + .subdevice_id = SPDK_PCI_ANY_ID, + }, + { .vendor_id = 0, /* sentinel */ }, +}; + +SPDK_PCI_DRIVER_REGISTER(nvme, nvme_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); + const struct spdk_nvme_transport_ops pcie_ops = { .name = "PCIE", .type = SPDK_NVME_TRANSPORT_PCIE, @@ -2542,6 +2595,7 @@ const struct spdk_nvme_transport_ops pcie_ops = { .qpair_reset = nvme_pcie_qpair_reset, .qpair_submit_request = nvme_pcie_qpair_submit_request, .qpair_process_completions = nvme_pcie_qpair_process_completions, + .qpair_iterate_requests = nvme_pcie_qpair_iterate_requests, .admin_qpair_abort_aers = nvme_pcie_admin_qpair_abort_aers, .poll_group_create = nvme_pcie_poll_group_create, diff --git a/lib/nvme/nvme_qpair.c b/lib/nvme/nvme_qpair.c index 433e41f8c73..2303b916be9 100644 --- a/lib/nvme/nvme_qpair.c +++ b/lib/nvme/nvme_qpair.c @@ -34,6 +34,8 @@ #include "nvme_internal.h" #include "spdk/nvme_ocssd.h" +#define NVME_CMD_DPTR_STR_SIZE 256 + static int nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req); struct nvme_string { @@ -64,14 +66,50 @@ static const struct nvme_string admin_opcode[] = { { SPDK_NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" }, { SPDK_NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" }, { SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" }, + { SPDK_NVME_OPC_FABRIC, "FABRIC" }, { SPDK_NVME_OPC_FORMAT_NVM, "FORMAT NVM" }, { SPDK_NVME_OPC_SECURITY_SEND, "SECURITY SEND" }, { 
SPDK_NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" }, { SPDK_NVME_OPC_SANITIZE, "SANITIZE" }, + { SPDK_NVME_OPC_GET_LBA_STATUS, "GET LBA STATUS" }, { SPDK_OCSSD_OPC_GEOMETRY, "OCSSD / GEOMETRY" }, { 0xFFFF, "ADMIN COMMAND" } }; +static const struct nvme_string fabric_opcode[] = { + { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET, "PROPERTY SET" }, + { SPDK_NVMF_FABRIC_COMMAND_CONNECT, "CONNECT" }, + { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET, "PROPERTY GET" }, + { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_SEND, "AUTHENTICATION SEND" }, + { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_RECV, "AUTHENTICATION RECV" }, + { 0xFFFF, "RESERVED / VENDOR SPECIFIC" } +}; + +static const struct nvme_string feat_opcode[] = { + { SPDK_NVME_FEAT_ARBITRATION, "ARBITRATION" }, + { SPDK_NVME_FEAT_POWER_MANAGEMENT, "POWER MANAGEMENT" }, + { SPDK_NVME_FEAT_LBA_RANGE_TYPE, "LBA RANGE TYPE" }, + { SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD, "TEMPERATURE THRESHOLD" }, + { SPDK_NVME_FEAT_ERROR_RECOVERY, "ERROR_RECOVERY" }, + { SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE, "VOLATILE WRITE CACHE" }, + { SPDK_NVME_FEAT_NUMBER_OF_QUEUES, "NUMBER OF QUEUES" }, + { SPDK_NVME_FEAT_INTERRUPT_COALESCING, "INTERRUPT COALESCING" }, + { SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION, "INTERRUPT VECTOR CONFIGURATION" }, + { SPDK_NVME_FEAT_WRITE_ATOMICITY, "WRITE ATOMICITY" }, + { SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, "ASYNC EVENT CONFIGURATION" }, + { SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION, "AUTONOMOUS POWER STATE TRANSITION" }, + { SPDK_NVME_FEAT_HOST_MEM_BUFFER, "HOST MEM BUFFER" }, + { SPDK_NVME_FEAT_TIMESTAMP, "TIMESTAMP" }, + { SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, "KEEP ALIVE TIMER" }, + { SPDK_NVME_FEAT_HOST_CONTROLLED_THERMAL_MANAGEMENT, "HOST CONTROLLED THERMAL MANAGEMENT" }, + { SPDK_NVME_FEAT_NON_OPERATIONAL_POWER_STATE_CONFIG, "NON OPERATIONAL POWER STATE CONFIG" }, + { SPDK_NVME_FEAT_SOFTWARE_PROGRESS_MARKER, "SOFTWARE PROGRESS MARKER" }, + { SPDK_NVME_FEAT_HOST_IDENTIFIER, "HOST IDENTIFIER" }, + { 
SPDK_NVME_FEAT_HOST_RESERVE_MASK, "HOST RESERVE MASK" }, + { SPDK_NVME_FEAT_HOST_RESERVE_PERSIST, "HOST RESERVE PERSIST" }, + { 0xFFFF, "RESERVED" } +}; + static const struct nvme_string io_opcode[] = { { SPDK_NVME_OPC_FLUSH, "FLUSH" }, { SPDK_NVME_OPC_WRITE, "WRITE" }, @@ -91,6 +129,24 @@ static const struct nvme_string io_opcode[] = { { 0xFFFF, "IO COMMAND" } }; +static const struct nvme_string sgl_type[] = { + { SPDK_NVME_SGL_TYPE_DATA_BLOCK, "DATA BLOCK" }, + { SPDK_NVME_SGL_TYPE_BIT_BUCKET, "BIT BUCKET" }, + { SPDK_NVME_SGL_TYPE_SEGMENT, "SEGMENT" }, + { SPDK_NVME_SGL_TYPE_LAST_SEGMENT, "LAST SEGMENT" }, + { SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK, "TRANSPORT DATA BLOCK" }, + { SPDK_NVME_SGL_TYPE_VENDOR_SPECIFIC, "VENDOR SPECIFIC" }, + { 0xFFFF, "RESERVED" } +}; + +static const struct nvme_string sgl_subtype[] = { + { SPDK_NVME_SGL_SUBTYPE_ADDRESS, "ADDRESS" }, + { SPDK_NVME_SGL_SUBTYPE_OFFSET, "OFFSET" }, + { SPDK_NVME_SGL_SUBTYPE_TRANSPORT, "TRANSPORT" }, + { SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY, "INVALIDATE KEY" }, + { 0xFFFF, "RESERVED" } +}; + static const char * nvme_get_string(const struct nvme_string *strings, uint16_t value) { @@ -108,61 +164,146 @@ nvme_get_string(const struct nvme_string *strings, uint16_t value) } static void -nvme_admin_qpair_print_command(struct spdk_nvme_qpair *qpair, - struct spdk_nvme_cmd *cmd) +nvme_get_sgl_unkeyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd) { + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; - SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%x " - "cdw10:%08x cdw11:%08x\n", - nvme_get_string(admin_opcode, cmd->opc), cmd->opc, qpair->id, cmd->cid, - cmd->nsid, cmd->cdw10, cmd->cdw11); + snprintf(buf, size, " len:0x%x", sgl->unkeyed.length); } static void -nvme_io_qpair_print_command(struct spdk_nvme_qpair *qpair, - struct spdk_nvme_cmd *cmd) +nvme_get_sgl_keyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd) { - assert(qpair != NULL); + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + 
+ snprintf(buf, size, " len:0x%x key:0x%x", sgl->keyed.length, sgl->keyed.key); +} + +static void +nvme_get_sgl(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + int c; + + c = snprintf(buf, size, "SGL %s %s 0x%" PRIx64, nvme_get_string(sgl_type, sgl->generic.type), + nvme_get_string(sgl_subtype, sgl->generic.subtype), sgl->address); + assert(c >= 0 && (size_t)c < size); + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { /* unkeyed layout: append length only */ + nvme_get_sgl_unkeyed(buf + c, size - c, cmd); + } + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK) { /* keyed layout: append length and key */ + nvme_get_sgl_keyed(buf + c, size - c, cmd); + } +} + +static void +nvme_get_prp(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + snprintf(buf, size, "PRP1 0x%" PRIx64 " PRP2 0x%" PRIx64, cmd->dptr.prp.prp1, cmd->dptr.prp.prp2); +} + +static void +nvme_get_dptr(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + if (spdk_nvme_opc_get_data_transfer(cmd->opc) != SPDK_NVME_DATA_NONE) { + switch (cmd->psdt) { + case SPDK_NVME_PSDT_PRP: + nvme_get_prp(buf, size, cmd); + break; + case SPDK_NVME_PSDT_SGL_MPTR_CONTIG: + case SPDK_NVME_PSDT_SGL_MPTR_SGL: + nvme_get_sgl(buf, size, cmd); + break; + default: + ; + } + } +} + +static void +nvme_admin_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvmf_capsule_cmd *fcmd = (void *)cmd; + char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'}; + + assert(cmd != NULL); + + nvme_get_dptr(dptr, sizeof(dptr), cmd); + + switch ((int)cmd->opc) { + case SPDK_NVME_OPC_SET_FEATURES: + case SPDK_NVME_OPC_GET_FEATURES: + SPDK_NOTICELOG("%s %s cid:%d cdw10:%08x %s\n", + nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(feat_opcode, + cmd->cdw10_bits.set_features.fid), cmd->cid, cmd->cdw10, dptr); + break; + case SPDK_NVME_OPC_FABRIC: + SPDK_NOTICELOG("%s %s qid:%d cid:%d %s\n", + nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(fabric_opcode, fcmd->fctype), qid, + fcmd->cid, dptr); + 
break; + default: + SPDK_NOTICELOG("%s (%02x) qid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x %s\n", + nvme_get_string(admin_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid, cmd->cdw10, + cmd->cdw11, dptr); + } +} + +static void +nvme_io_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'}; + assert(cmd != NULL); + + nvme_get_dptr(dptr, sizeof(dptr), cmd); + switch ((int)cmd->opc) { case SPDK_NVME_OPC_WRITE: case SPDK_NVME_OPC_READ: case SPDK_NVME_OPC_WRITE_UNCORRECTABLE: case SPDK_NVME_OPC_COMPARE: SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d " - "lba:%llu len:%d\n", - nvme_get_string(io_opcode, cmd->opc), qpair->id, cmd->cid, - cmd->nsid, + "lba:%llu len:%d %s\n", + nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid, ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10, - (cmd->cdw12 & 0xFFFF) + 1); + (cmd->cdw12 & 0xFFFF) + 1, dptr); break; case SPDK_NVME_OPC_FLUSH: case SPDK_NVME_OPC_DATASET_MANAGEMENT: SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d\n", - nvme_get_string(io_opcode, cmd->opc), qpair->id, cmd->cid, - cmd->nsid); + nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid); break; default: SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%d\n", - nvme_get_string(io_opcode, cmd->opc), cmd->opc, qpair->id, - cmd->cid, cmd->nsid); + nvme_get_string(io_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid); break; } } void -spdk_nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd *cmd) +spdk_nvme_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) { - assert(qpair != NULL); assert(cmd != NULL); - if (nvme_qpair_is_admin_queue(qpair)) { - nvme_admin_qpair_print_command(qpair, cmd); + if (qid == 0 || cmd->opc == SPDK_NVME_OPC_FABRIC) { + nvme_admin_qpair_print_command(qid, cmd); } else { - nvme_io_qpair_print_command(qpair, cmd); + nvme_io_qpair_print_command(qid, cmd); } } +void +spdk_nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd 
*cmd) +{ + assert(qpair != NULL); + assert(cmd != NULL); + + spdk_nvme_print_command(qpair->id, cmd); +} + static const struct nvme_string generic_status[] = { { SPDK_NVME_SC_SUCCESS, "SUCCESS" }, { SPDK_NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, @@ -297,15 +438,28 @@ spdk_nvme_cpl_get_status_string(const struct spdk_nvme_status *status) } void -spdk_nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, - struct spdk_nvme_cpl *cpl) +spdk_nvme_print_completion(uint16_t qid, struct spdk_nvme_cpl *cpl) { - SPDK_NOTICELOG("%s (%02x/%02x) sqid:%d cid:%d cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n", + assert(cpl != NULL); + + /* Check that sqid matches qid. Note that sqid is reserved + * for fabrics so don't print an error when sqid is 0. */ + if (cpl->sqid != qid && cpl->sqid != 0) { + SPDK_ERRLOG("sqid %u doesn't match qid\n", cpl->sqid); + } + + SPDK_NOTICELOG("%s (%02x/%02x) qid:%d cid:%d cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n", spdk_nvme_cpl_get_status_string(&cpl->status), - cpl->status.sct, cpl->status.sc, cpl->sqid, cpl->cid, cpl->cdw0, + cpl->status.sct, cpl->status.sc, qid, cpl->cid, cpl->cdw0, cpl->sqhd, cpl->status.p, cpl->status.m, cpl->status.dnr); } +void +spdk_nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cpl *cpl) +{ + spdk_nvme_print_completion(qpair->id, cpl); +} + bool nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl) { @@ -388,7 +542,7 @@ nvme_qpair_manual_complete_request(struct spdk_nvme_qpair *qpair, } static void -nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +_nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) { struct nvme_request *req; @@ -403,6 +557,44 @@ nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) } } +/* The callback to a request may submit the next request which is queued and + * then the same callback may abort it immediately. This repetition may cause + * infinite recursive calls. 
Hence move aborting requests to another list here + * and abort them later at resubmission. + */ +static void +_nvme_qpair_complete_abort_queued_reqs(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->aborting_queued_req)) { + req = STAILQ_FIRST(&qpair->aborting_queued_req); + STAILQ_REMOVE_HEAD(&qpair->aborting_queued_req, stailq); + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, 1, true); + } +} + +uint32_t +nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg) +{ + struct nvme_request *req, *tmp; + uint32_t aborting = 0; + + STAILQ_FOREACH_SAFE(req, &qpair->queued_req, stailq, tmp) { + if (req->cb_arg == cmd_cb_arg) { + STAILQ_REMOVE(&qpair->queued_req, req, nvme_request, stailq); + STAILQ_INSERT_TAIL(&qpair->aborting_queued_req, req, stailq); + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting queued i/o\n"); + } + aborting++; + } + } + + return aborting; +} + static inline bool nvme_qpair_check_enabled(struct spdk_nvme_qpair *qpair) { @@ -475,6 +667,8 @@ nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_request break; } } + + _nvme_qpair_complete_abort_queued_reqs(qpair); } int32_t @@ -566,6 +760,7 @@ nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id, STAILQ_INIT(&qpair->free_req); STAILQ_INIT(&qpair->queued_req); + STAILQ_INIT(&qpair->aborting_queued_req); TAILQ_INIT(&qpair->err_cmd_head); STAILQ_INIT(&qpair->err_req_head); @@ -608,7 +803,8 @@ nvme_qpair_deinit(struct spdk_nvme_qpair *qpair) { struct nvme_error_cmd *cmd, *entry; - nvme_qpair_abort_queued_reqs(qpair, 1); + _nvme_qpair_abort_queued_reqs(qpair, 1); + _nvme_qpair_complete_abort_queued_reqs(qpair); nvme_qpair_complete_error_reqs(qpair); TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) { @@ -630,6 +826,21 @@ _nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *r 
nvme_qpair_check_enabled(qpair); + if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTED || + nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING || + nvme_qpair_get_state(qpair) == NVME_QPAIR_DESTROYING)) { + TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) { + nvme_request_remove_child(req, child_req); + nvme_request_free_children(child_req); + nvme_free_request(child_req); + } + if (req->parent != NULL) { + nvme_request_remove_child(req->parent, req); + } + nvme_free_request(req); + return -ENXIO; + } + if (req->num_children) { /* * This is a split (parent) request. Submit all of the children but not the parent @@ -650,9 +861,12 @@ _nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *r if (spdk_unlikely(child_req_failed)) { /* part of children requests have been submitted, - * return success for this case. + * return success since we must wait for those children to complete, + * but set the parent request to failure. */ if (req->num_children) { + req->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; return 0; } goto error; @@ -743,12 +957,6 @@ nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *re { int rc; - /* This prevents us from entering an infinite loop when freeing queued I/O in disconnect. 
*/ - if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING || - nvme_qpair_get_state(qpair) == NVME_QPAIR_DESTROYING)) { - return -ENXIO; - } - if (spdk_unlikely(!STAILQ_EMPTY(&qpair->queued_req) && req->num_children == 0)) { /* * requests that have no children should be sent to the transport after all @@ -794,7 +1002,8 @@ void nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) { nvme_qpair_complete_error_reqs(qpair); - nvme_qpair_abort_queued_reqs(qpair, dnr); + _nvme_qpair_abort_queued_reqs(qpair, dnr); + _nvme_qpair_complete_abort_queued_reqs(qpair); nvme_transport_qpair_abort_reqs(qpair, dnr); } diff --git a/lib/nvme/nvme_quirks.c b/lib/nvme/nvme_quirks.c index fa4326349d5..38c8f0eaeb8 100644 --- a/lib/nvme/nvme_quirks.c +++ b/lib/nvme/nvme_quirks.c @@ -39,7 +39,7 @@ struct nvme_quirk { }; static const struct nvme_quirk nvme_quirks[] = { - { {SPDK_PCI_VID_INTEL, 0x0953, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0953, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_INTEL_QUIRK_READ_LATENCY | NVME_INTEL_QUIRK_WRITE_LATENCY | NVME_INTEL_QUIRK_STRIPING | @@ -47,7 +47,7 @@ static const struct nvme_quirk nvme_quirks[] = { NVME_QUIRK_DELAY_BEFORE_INIT | NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE }, - { {SPDK_PCI_VID_INTEL, 0x0A53, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A53, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_INTEL_QUIRK_READ_LATENCY | NVME_INTEL_QUIRK_WRITE_LATENCY | NVME_INTEL_QUIRK_STRIPING | @@ -55,52 +55,56 @@ static const struct nvme_quirk nvme_quirks[] = { NVME_QUIRK_DELAY_BEFORE_INIT | NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE }, - { {SPDK_PCI_VID_INTEL, 0x0A54, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A54, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_INTEL_QUIRK_READ_LATENCY | NVME_INTEL_QUIRK_WRITE_LATENCY | NVME_INTEL_QUIRK_STRIPING | NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE }, - { 
{SPDK_PCI_VID_INTEL, 0x0A55, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A55, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_INTEL_QUIRK_READ_LATENCY | NVME_INTEL_QUIRK_WRITE_LATENCY | NVME_INTEL_QUIRK_STRIPING | NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE }, - { {SPDK_PCI_VID_MEMBLAZE, 0x0540, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_MEMBLAZE, 0x0540, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_QUIRK_DELAY_BEFORE_CHK_RDY }, - { {SPDK_PCI_VID_SAMSUNG, 0xa821, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa821, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_QUIRK_DELAY_BEFORE_CHK_RDY }, - { {SPDK_PCI_VID_SAMSUNG, 0xa822, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa822, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_QUIRK_DELAY_BEFORE_CHK_RDY }, - { {SPDK_PCI_VID_VIRTUALBOX, 0x4e56, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VIRTUALBOX, 0x4e56, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC }, - { {SPDK_PCI_VID_INTEL, 0x5845, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x5845, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_QUIRK_IDENTIFY_CNS | NVME_INTEL_QUIRK_NO_LOG_PAGES | NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH }, - { {SPDK_PCI_VID_CNEXLABS, 0x1f1f, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_CNEXLABS, 0x1f1f, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_QUIRK_IDENTIFY_CNS | NVME_QUIRK_OCSSD }, - { {SPDK_PCI_VID_VMWARE, 0x07f0, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VMWARE, 0x07f0, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, NVME_QUIRK_SHST_COMPLETE }, - { {0x0000, 0x0000, 0x0000, 0x0000}, 0} + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x2700, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_OACS_SECURITY + }, + { {0x000000, 0x0000, 0x0000, 0x0000, 0x0000}, 0} }; /* Compare each 
field. SPDK_PCI_ANY_ID in s1 matches everything */ static bool pci_id_match(const struct spdk_pci_id *s1, const struct spdk_pci_id *s2) { - if ((s1->vendor_id == SPDK_PCI_ANY_ID || s1->vendor_id == s2->vendor_id) && + if ((s1->class_id == SPDK_PCI_CLASS_ANY_ID || s1->class_id == s2->class_id) && + (s1->vendor_id == SPDK_PCI_ANY_ID || s1->vendor_id == s2->vendor_id) && (s1->device_id == SPDK_PCI_ANY_ID || s1->device_id == s2->device_id) && (s1->subvendor_id == SPDK_PCI_ANY_ID || s1->subvendor_id == s2->subvendor_id) && (s1->subdevice_id == SPDK_PCI_ANY_ID || s1->subdevice_id == s2->subdevice_id)) { diff --git a/lib/nvme/nvme_rdma.c b/lib/nvme/nvme_rdma.c index 5c9895712e6..af1e5232d99 100644 --- a/lib/nvme/nvme_rdma.c +++ b/lib/nvme/nvme_rdma.c @@ -95,6 +95,11 @@ */ #define NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES 50 +/* + * The max length of keyed SGL data block (3 bytes) + */ +#define NVME_RDMA_MAX_KEYED_SGL_LENGTH ((1u << 24u) - 1) + #define WC_PER_QPAIR(queue_depth) (queue_depth * 2) enum nvme_rdma_wr_type { @@ -166,11 +171,6 @@ struct nvme_rdma_poll_group { STAILQ_HEAD(, nvme_rdma_destroyed_qpair) destroyed_qpairs; }; -struct spdk_nvme_send_wr_list { - struct ibv_send_wr *first; - struct ibv_send_wr *last; -}; - struct spdk_nvme_recv_wr_list { struct ibv_recv_wr *first; struct ibv_recv_wr *last; @@ -210,7 +210,6 @@ struct nvme_rdma_qpair { struct ibv_recv_wr *rsp_recv_wrs; - struct spdk_nvme_send_wr_list sends_to_post; struct spdk_nvme_recv_wr_list recvs_to_post; /* Memory region describing all rsps for this qpair */ @@ -364,14 +363,21 @@ nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdm { rdma_req->completion_flags = 0; rdma_req->req = NULL; - TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link); } static void -nvme_rdma_req_complete(struct nvme_request *req, +nvme_rdma_req_complete(struct spdk_nvme_rdma_req *rdma_req, struct spdk_nvme_cpl *rsp) { + struct nvme_request 
*req = rdma_req->req; + struct nvme_rdma_qpair *rqpair; + + assert(req != NULL); + + rqpair = nvme_rdma_qpair(req->qpair); + TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp); nvme_free_request(req); } @@ -665,7 +671,7 @@ static inline int nvme_rdma_qpair_submit_recvs(struct nvme_rdma_qpair *rqpair) { struct ibv_recv_wr *bad_recv_wr; - int rc; + int rc = 0; if (rqpair->recvs_to_post.first) { rc = ibv_post_recv(rqpair->rdma_qp->qp, rqpair->recvs_to_post.first, &bad_recv_wr); @@ -677,12 +683,11 @@ nvme_rdma_qpair_submit_recvs(struct nvme_rdma_qpair *rqpair) rqpair->current_num_recvs--; bad_recv_wr = bad_recv_wr->next; } - return rc; } rqpair->recvs_to_post.first = NULL; } - return 0; + return rc; } /* Append the given send wr structure to the qpair's outstanding sends list. */ @@ -1090,7 +1095,7 @@ nvme_rdma_connect(struct nvme_rdma_qpair *rqpair) return -EAGAIN; } else if (ret) { SPDK_ERRLOG("RDMA connect error %d\n", ret); - return -1; + return ret; } else { return 0; } @@ -1350,7 +1355,7 @@ _nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_q if (rc < 0) { rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); - return -1; + return rc; } return 0; @@ -1379,7 +1384,7 @@ nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qp } while (rc == -EAGAIN && retry_count < NVME_RDMA_STALE_CONN_RETRY_MAX); } - return rc == -EAGAIN ? 
-1 : rc; + return rc; } /* @@ -1511,6 +1516,12 @@ nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair, assert(req->payload_size != 0); assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + if (spdk_unlikely(req->payload_size > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) { + SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n", + req->payload_size, NVME_RDMA_MAX_KEYED_SGL_LENGTH); + return -1; + } + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, payload, req->payload_size, NVME_RDMA_MR_RKEY, &rkey))) { return -1; @@ -1569,6 +1580,12 @@ nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair, sge_length = spdk_min(remaining_size, sge_length); + if (spdk_unlikely(sge_length > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) { + SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n", + sge_length, NVME_RDMA_MAX_KEYED_SGL_LENGTH); + return -1; + } + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, virt_addr, sge_length, NVME_RDMA_MR_RKEY, &rkey))) { return -1; @@ -1617,12 +1634,18 @@ nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair, * Otherwise, The SGL descriptor embedded in the command must point to the list of * SGL descriptors used to describe the operation. In that case it is a last segment descriptor. 
*/ - rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + sizeof(struct - spdk_nvme_sgl_descriptor) * num_sgl_desc; + uint32_t descriptors_size = sizeof(struct spdk_nvme_sgl_descriptor) * num_sgl_desc; + + if (spdk_unlikely(descriptors_size > rqpair->qpair.ctrlr->ioccsz_bytes)) { + SPDK_ERRLOG("Size of SGL descriptors (%u) exceeds ICD (%u)\n", + descriptors_size, rqpair->qpair.ctrlr->ioccsz_bytes); + return -1; + } + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + descriptors_size; req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; - req->cmd.dptr.sgl1.unkeyed.length = num_sgl_desc * sizeof(struct spdk_nvme_sgl_descriptor); + req->cmd.dptr.sgl1.unkeyed.length = descriptors_size; req->cmd.dptr.sgl1.address = (uint64_t)0; } @@ -1731,6 +1754,7 @@ nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req, } if (rc) { + rdma_req->req = NULL; return rc; } @@ -1817,17 +1841,17 @@ nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme } if (rqpair->cm_id) { - spdk_rdma_qp_disconnect(rqpair->rdma_qp); - if (rctrlr != NULL) { - if (nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_DISCONNECTED)) { - SPDK_DEBUGLOG(SPDK_LOG_NVME, "Target did not respond to qpair disconnect.\n"); - } - } - if (rqpair->rdma_qp) { + spdk_rdma_qp_disconnect(rqpair->rdma_qp); + if (rctrlr != NULL) { + if (nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_DISCONNECTED)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Target did not respond to qpair disconnect.\n"); + } + } spdk_rdma_qp_destroy(rqpair->rdma_qp); rqpair->rdma_qp = NULL; } + rdma_destroy_id(rqpair->cm_id); rqpair->cm_id = NULL; } @@ -1848,6 +1872,7 @@ nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ rqpair = nvme_rdma_qpair(qpair); nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); if (rqpair->defer_deletion_to_pg) { + 
nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); return 0; } @@ -2054,6 +2079,7 @@ nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair, if (nvme_rdma_req_init(rqpair, req, rdma_req)) { SPDK_ERRLOG("nvme_rdma_req_init() failed\n"); + TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); nvme_rdma_req_put(rqpair, rdma_req); return -1; } @@ -2075,7 +2101,6 @@ static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) { struct spdk_nvme_rdma_req *rdma_req, *tmp; - struct nvme_request *req; struct spdk_nvme_cpl cpl; struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); @@ -2094,10 +2119,7 @@ nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) } TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { - assert(rdma_req->req != NULL); - req = rdma_req->req; - - nvme_rdma_req_complete(req, &cpl); + nvme_rdma_req_complete(rdma_req, &cpl); nvme_rdma_req_put(rqpair, rdma_req); } } @@ -2144,7 +2166,7 @@ nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair) static inline int nvme_rdma_request_ready(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req) { - nvme_rdma_req_complete(rdma_req->req, &rqpair->rsps[rdma_req->rsp_idx].cpl); + nvme_rdma_req_complete(rdma_req, &rqpair->rsps[rdma_req->rsp_idx].cpl); nvme_rdma_req_put(rqpair, rdma_req); return nvme_rdma_post_recv(rqpair, rdma_req->rsp_idx); } @@ -2386,11 +2408,33 @@ nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) return rctrlr->max_sge; } +static int +nvme_rdma_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct spdk_nvme_rdma_req *rdma_req, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + rc = iter_fn(rdma_req->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; 
+} + static void nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) { struct spdk_nvme_rdma_req *rdma_req, *tmp; - struct nvme_request *req; struct spdk_nvme_cpl cpl; struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); @@ -2398,13 +2442,13 @@ nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) cpl.status.sct = SPDK_NVME_SCT_GENERIC; TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + if (rdma_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { continue; } - assert(rdma_req->req != NULL); - req = rdma_req->req; - nvme_rdma_req_complete(req, &cpl); + nvme_rdma_req_complete(rdma_req, &cpl); nvme_rdma_req_put(rqpair, rdma_req); } } @@ -2724,7 +2768,7 @@ nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group * STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) { qpair_tracker->completed_cycles++; rqpair = qpair_tracker->destroyed_qpair_tracker; - if ((rqpair->current_num_sends == 0 && rqpair->current_num_sends == 0) || + if ((rqpair->current_num_sends == 0 && rqpair->current_num_recvs == 0) || qpair_tracker->completed_cycles > NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES) { nvme_rdma_poll_group_delete_qpair(group, qpair_tracker); } @@ -2792,6 +2836,7 @@ const struct spdk_nvme_transport_ops rdma_ops = { .qpair_reset = nvme_rdma_qpair_reset, .qpair_submit_request = nvme_rdma_qpair_submit_request, .qpair_process_completions = nvme_rdma_qpair_process_completions, + .qpair_iterate_requests = nvme_rdma_qpair_iterate_requests, .admin_qpair_abort_aers = nvme_rdma_admin_qpair_abort_aers, .poll_group_create = nvme_rdma_poll_group_create, diff --git a/lib/nvme/nvme_tcp.c b/lib/nvme/nvme_tcp.c index b8128ffffd0..e4cd2b5696e 100644 --- a/lib/nvme/nvme_tcp.c +++ b/lib/nvme/nvme_tcp.c @@ -81,7 +81,8 @@ struct nvme_tcp_qpair { TAILQ_HEAD(, nvme_tcp_pdu) send_queue; struct nvme_tcp_pdu recv_pdu; - struct nvme_tcp_pdu send_pdu; /* only for 
error pdu and init pdu */ + struct nvme_tcp_pdu *send_pdu; /* only for error pdu and init pdu */ + struct nvme_tcp_pdu *send_pdus; /* Used by tcp_reqs */ enum nvme_tcp_pdu_recv_state recv_state; struct nvme_tcp_req *tcp_reqs; @@ -117,9 +118,24 @@ struct nvme_tcp_req { uint32_t r2tl_remain; uint32_t active_r2ts; bool in_capsule_data; - struct nvme_tcp_pdu send_pdu; + /* It is used to track whether the req can be safely freed */ + union { + uint8_t raw; + struct { + /* The last send operation completed - kernel released send buffer */ + uint8_t send_ack : 1; + /* Data transfer completed - target send resp or last data bit */ + uint8_t data_recv : 1; + /* tcp_req is waiting for completion of the previous send operation (buffer reclaim notification + * from kernel) to send H2C */ + uint8_t h2c_send_waiting_ack : 1; + uint8_t reserved : 5; + } bits; + } ordering; + struct nvme_tcp_pdu *send_pdu; struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS]; uint32_t iovcnt; + struct nvme_tcp_qpair *tqpair; TAILQ_ENTRY(nvme_tcp_req) link; }; @@ -164,7 +180,8 @@ nvme_tcp_req_get(struct nvme_tcp_qpair *tqpair) tcp_req->r2tl_remain = 0; tcp_req->active_r2ts = 0; tcp_req->iovcnt = 0; - memset(&tcp_req->send_pdu, 0, sizeof(tcp_req->send_pdu)); + tcp_req->ordering.raw = 0; + memset(tcp_req->send_pdu, 0, sizeof(struct nvme_tcp_pdu)); TAILQ_INSERT_TAIL(&tqpair->outstanding_reqs, tcp_req, link); return tcp_req; @@ -175,8 +192,7 @@ nvme_tcp_req_put(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) { assert(tcp_req->state != NVME_TCP_REQ_FREE); tcp_req->state = NVME_TCP_REQ_FREE; - TAILQ_REMOVE(&tqpair->outstanding_reqs, tcp_req, link); - TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link); + TAILQ_INSERT_HEAD(&tqpair->free_reqs, tcp_req, link); } static int @@ -199,7 +215,7 @@ nvme_tcp_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, c if (res->ai_addrlen > sizeof(*sa)) { SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen); - ret = 
EINVAL; + ret = -EINVAL; } else { memcpy(sa, res->ai_addr, res->ai_addrlen); } @@ -213,6 +229,9 @@ nvme_tcp_free_reqs(struct nvme_tcp_qpair *tqpair) { free(tqpair->tcp_reqs); tqpair->tcp_reqs = NULL; + + spdk_free(tqpair->send_pdus); + tqpair->send_pdus = NULL; } static int @@ -223,7 +242,17 @@ nvme_tcp_alloc_reqs(struct nvme_tcp_qpair *tqpair) tqpair->tcp_reqs = calloc(tqpair->num_entries, sizeof(struct nvme_tcp_req)); if (tqpair->tcp_reqs == NULL) { - SPDK_ERRLOG("Failed to allocate tcp_reqs\n"); + SPDK_ERRLOG("Failed to allocate tcp_reqs on tqpair=%p\n", tqpair); + goto fail; + } + + /* Add additional one member for the send_pdu owned by the tqpair */ + tqpair->send_pdus = spdk_zmalloc((tqpair->num_entries + 1) * sizeof(struct nvme_tcp_pdu), + 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + + if (tqpair->send_pdus == NULL) { + SPDK_ERRLOG("Failed to allocate send_pdus on tqpair=%p\n", tqpair); goto fail; } @@ -233,9 +262,13 @@ nvme_tcp_alloc_reqs(struct nvme_tcp_qpair *tqpair) for (i = 0; i < tqpair->num_entries; i++) { tcp_req = &tqpair->tcp_reqs[i]; tcp_req->cid = i; + tcp_req->tqpair = tqpair; + tcp_req->send_pdu = &tqpair->send_pdus[i]; TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link); } + tqpair->send_pdu = &tqpair->send_pdus[i]; + return 0; fail: nvme_tcp_free_reqs(tqpair); @@ -463,7 +496,7 @@ nvme_tcp_req_init(struct nvme_tcp_qpair *tqpair, struct nvme_request *req, if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { max_incapsule_data_size = ctrlr->ioccsz_bytes; if ((req->cmd.opc == SPDK_NVME_OPC_FABRIC) || nvme_qpair_is_admin_queue(&tqpair->qpair)) { - max_incapsule_data_size = spdk_min(max_incapsule_data_size, NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE); + max_incapsule_data_size = NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE; } if (req->payload_size <= max_incapsule_data_size) { @@ -477,9 +510,28 @@ nvme_tcp_req_init(struct nvme_tcp_qpair *tqpair, struct nvme_request *req, return 0; } +static inline void +nvme_tcp_req_put_safe(struct nvme_tcp_req *tcp_req) 
+{ + if (tcp_req->ordering.bits.send_ack && tcp_req->ordering.bits.data_recv) { + assert(tcp_req->state == NVME_TCP_REQ_ACTIVE); + assert(tcp_req->tqpair != NULL); + nvme_tcp_req_put(tcp_req->tqpair, tcp_req); + } +} + static void nvme_tcp_qpair_cmd_send_complete(void *cb_arg) { + struct nvme_tcp_req *tcp_req = cb_arg; + + tcp_req->ordering.bits.send_ack = 1; + /* Handle the r2t case */ + if (spdk_unlikely(tcp_req->ordering.bits.h2c_send_waiting_ack)) { + nvme_tcp_send_h2c_data(tcp_req); + } else { + nvme_tcp_req_put_safe(tcp_req); + } } static int @@ -492,7 +544,7 @@ nvme_tcp_qpair_capsule_cmd_send(struct nvme_tcp_qpair *tqpair, uint8_t pdo; SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); - pdu = &tcp_req->send_pdu; + pdu = tcp_req->send_pdu; capsule_cmd = &pdu->hdr.capsule_cmd; capsule_cmd->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD; @@ -535,7 +587,7 @@ nvme_tcp_qpair_capsule_cmd_send(struct nvme_tcp_qpair *tqpair, 0, tcp_req->req->payload_size); end: capsule_cmd->common.plen = plen; - return nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_qpair_cmd_send_complete, NULL); + return nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_qpair_cmd_send_complete, tcp_req); } @@ -558,6 +610,7 @@ nvme_tcp_qpair_submit_request(struct spdk_nvme_qpair *qpair, if (nvme_tcp_req_init(tqpair, req, tcp_req)) { SPDK_ERRLOG("nvme_tcp_req_init() failed\n"); + TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); nvme_tcp_req_put(tqpair, tcp_req); return -1; } @@ -572,9 +625,15 @@ nvme_tcp_qpair_reset(struct spdk_nvme_qpair *qpair) } static void -nvme_tcp_req_complete(struct nvme_request *req, +nvme_tcp_req_complete(struct nvme_tcp_req *tcp_req, struct spdk_nvme_cpl *rsp) { + struct nvme_request *req; + + assert(tcp_req->req != NULL); + req = tcp_req->req; + + TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp); nvme_free_request(req); } @@ -583,7 +642,6 @@ static void 
nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) { struct nvme_tcp_req *tcp_req, *tmp; - struct nvme_request *req; struct spdk_nvme_cpl cpl; struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); @@ -592,10 +650,7 @@ nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) cpl.status.dnr = dnr; TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { - assert(tcp_req->req != NULL); - req = tcp_req->req; - - nvme_tcp_req_complete(req, &cpl); + nvme_tcp_req_complete(tcp_req, &cpl); nvme_tcp_req_put(tqpair, tcp_req); } } @@ -641,7 +696,7 @@ nvme_tcp_qpair_send_h2c_term_req(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_ uint32_t h2c_term_req_hdr_len = sizeof(*h2c_term_req); uint8_t copy_len; - rsp_pdu = &tqpair->send_pdu; + rsp_pdu = tqpair->send_pdu; memset(rsp_pdu, 0, sizeof(*rsp_pdu)); h2c_term_req = &rsp_pdu->hdr.term_req; h2c_term_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ; @@ -798,9 +853,13 @@ nvme_tcp_c2h_data_payload_handle(struct nvme_tcp_qpair *tqpair, cpl.cid = tcp_req->cid; cpl.sqid = tqpair->qpair.id; - nvme_tcp_req_complete(tcp_req->req, &cpl); - nvme_tcp_req_put(tqpair, tcp_req); - (*reaped)++; + nvme_tcp_req_complete(tcp_req, &cpl); + if (tcp_req->ordering.bits.send_ack) { + (*reaped)++; + } + + tcp_req->ordering.bits.data_recv = 1; + nvme_tcp_req_put_safe(tcp_req); } } @@ -923,8 +982,9 @@ nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair, SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_ddgst_enable: %u\n", tqpair->host_ddgst_enable); /* Now that we know whether digests are enabled, properly size the receive buffer to - * handle 4 incoming 4K read commands. */ - recv_buf_size = 0x1000 + sizeof(struct spdk_nvme_tcp_cmd); + * handle several incoming 4K read commands according to SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR + * parameter. 
*/ + recv_buf_size = 0x1000 + sizeof(struct spdk_nvme_tcp_c2h_data_hdr); if (tqpair->host_hdgst_enable) { recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; @@ -934,7 +994,7 @@ nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair, recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; } - if (spdk_sock_set_recvbuf(tqpair->sock, recv_buf_size * 4) < 0) { + if (spdk_sock_set_recvbuf(tqpair->sock, recv_buf_size * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR) < 0) { SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n", tqpair, recv_buf_size); @@ -975,11 +1035,13 @@ nvme_tcp_capsule_resp_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_ } - assert(tcp_req->req != NULL); - assert(tcp_req->state == NVME_TCP_REQ_ACTIVE); - nvme_tcp_req_complete(tcp_req->req, &cpl); - nvme_tcp_req_put(tqpair, tcp_req); - (*reaped)++; + nvme_tcp_req_complete(tcp_req, &cpl); + if (tcp_req->ordering.bits.send_ack) { + (*reaped)++; + } + + tcp_req->ordering.bits.data_recv = 1; + nvme_tcp_req_put_safe(tcp_req); SPDK_DEBUGLOG(SPDK_LOG_NVME, "complete tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair); @@ -1081,12 +1143,15 @@ nvme_tcp_qpair_h2c_data_send_complete(void *cb_arg) assert(tcp_req != NULL); + tcp_req->ordering.bits.send_ack = 1; if (tcp_req->r2tl_remain) { nvme_tcp_send_h2c_data(tcp_req); } else { assert(tcp_req->active_r2ts > 0); tcp_req->active_r2ts--; tcp_req->state = NVME_TCP_REQ_ACTIVE; + /* Need also call this function to free the resource */ + nvme_tcp_req_put_safe(tcp_req); } } @@ -1098,7 +1163,10 @@ nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req) struct spdk_nvme_tcp_h2c_data_hdr *h2c_data; uint32_t plen, pdo, alignment; - rsp_pdu = &tcp_req->send_pdu; + /* Reinit the send_ack and h2c_send_waiting_ack bits */ + tcp_req->ordering.bits.send_ack = 0; + tcp_req->ordering.bits.h2c_send_waiting_ack = 0; + rsp_pdu = tcp_req->send_pdu; memset(rsp_pdu, 0, sizeof(*rsp_pdu)); h2c_data = &rsp_pdu->hdr.h2c_data; @@ -1200,7 +1268,12 @@ nvme_tcp_r2t_hdr_handle(struct 
nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) tcp_req->r2tl_remain = r2t->r2tl; nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); - nvme_tcp_send_h2c_data(tcp_req); + if (spdk_likely(tcp_req->ordering.bits.send_ack)) { + nvme_tcp_send_h2c_data(tcp_req); + } else { + tcp_req->ordering.bits.h2c_send_waiting_ack = 1; + } + return; end: @@ -1476,8 +1549,8 @@ nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair) uint64_t icreq_timeout_tsc; int rc; - pdu = &tqpair->send_pdu; - memset(&tqpair->send_pdu, 0, sizeof(tqpair->send_pdu)); + pdu = tqpair->send_pdu; + memset(tqpair->send_pdu, 0, sizeof(*tqpair->send_pdu)); ic_req = &pdu->hdr.ic_req; ic_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_REQ; @@ -1529,7 +1602,8 @@ nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpa break; default: SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); - return -1; + rc = -1; + return rc; } SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); @@ -1540,7 +1614,7 @@ nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpa rc = nvme_tcp_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid); if (rc != 0) { SPDK_ERRLOG("dst_addr nvme_tcp_parse_addr() failed\n"); - return -1; + return rc; } if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { @@ -1548,14 +1622,15 @@ nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpa rc = nvme_tcp_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid); if (rc != 0) { SPDK_ERRLOG("src_addr nvme_tcp_parse_addr() failed\n"); - return -1; + return rc; } } port = spdk_strtol(ctrlr->trid.trsvcid, 10); if (port <= 0 || port >= INT_MAX) { SPDK_ERRLOG("Invalid port: %s\n", ctrlr->trid.trsvcid); - return -1; + rc = -1; + return rc; } opts.opts_size = sizeof(opts); @@ -1565,7 +1640,8 @@ nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpa if 
(!tqpair->sock) { SPDK_ERRLOG("sock connection error of tqpair=%p with addr=%s, port=%ld\n", tqpair, ctrlr->trid.traddr, port); - return -1; + rc = -1; + return rc; } tqpair->maxr2t = NVME_TCP_MAX_R2T_DEFAULT; @@ -1577,13 +1653,13 @@ nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpa rc = nvme_tcp_qpair_icreq_send(tqpair); if (rc != 0) { SPDK_ERRLOG("Unable to connect the tqpair\n"); - return -1; + return rc; } rc = nvme_fabric_qpair_connect(&tqpair->qpair, tqpair->num_entries); if (rc < 0) { SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); - return -1; + return rc; } return 0; @@ -1712,11 +1788,33 @@ nvme_tcp_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) return 1; } +static int +nvme_tcp_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_req *tcp_req, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + + rc = iter_fn(tcp_req->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + static void nvme_tcp_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) { struct nvme_tcp_req *tcp_req, *tmp; - struct nvme_request *req; struct spdk_nvme_cpl cpl; struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); @@ -1725,12 +1823,11 @@ nvme_tcp_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { assert(tcp_req->req != NULL); - req = tcp_req->req; - if (req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + if (tcp_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { continue; } - nvme_tcp_req_complete(req, &cpl); + nvme_tcp_req_complete(tcp_req, &cpl); nvme_tcp_req_put(tqpair, tcp_req); } } @@ -1874,6 +1971,7 @@ const struct spdk_nvme_transport_ops tcp_ops = { .qpair_reset = nvme_tcp_qpair_reset, 
.qpair_submit_request = nvme_tcp_qpair_submit_request, .qpair_process_completions = nvme_tcp_qpair_process_completions, + .qpair_iterate_requests = nvme_tcp_qpair_iterate_requests, .admin_qpair_abort_aers = nvme_tcp_admin_qpair_abort_aers, .poll_group_create = nvme_tcp_poll_group_create, diff --git a/lib/nvme/nvme_transport.c b/lib/nvme/nvme_transport.c index a499023f101..76efd59660c 100644 --- a/lib/nvme/nvme_transport.c +++ b/lib/nvme/nvme_transport.c @@ -278,7 +278,16 @@ nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid int nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) { - return qpair->transport->ops.ctrlr_delete_io_qpair(ctrlr, qpair); + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + + /* Do not rely on qpair->transport. For multi-process cases, a foreign process may delete + * the IO qpair, in which case the transport object would be invalid (each process has their + * own unique transport objects since they contain function pointers). So we look up the + * transport object in the delete_io_qpair case. 
+ */ + return transport->ops.ctrlr_delete_io_qpair(ctrlr, qpair); } int @@ -399,6 +408,22 @@ nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t return transport->ops.qpair_process_completions(qpair, max_completions); } +int +nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_iterate_requests(qpair, iter_fn, arg); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_iterate_requests(qpair, iter_fn, arg); +} + void nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) { diff --git a/lib/nvme/nvme_uevent.c b/lib/nvme/nvme_uevent.c index 1bcfff1cbd4..b03cefd3481 100644 --- a/lib/nvme/nvme_uevent.c +++ b/lib/nvme/nvme_uevent.c @@ -43,13 +43,14 @@ #include #define SPDK_UEVENT_MSG_LEN 4096 +#define SPDK_UEVENT_RECVBUF_SIZE 1024 * 1024 int nvme_uevent_connect(void) { struct sockaddr_nl addr; int netlink_fd; - int size = 64 * 1024; + int size = SPDK_UEVENT_RECVBUF_SIZE; int flag; memset(&addr, 0, sizeof(addr)); @@ -144,9 +145,7 @@ parse_event(const char *buf, struct spdk_uevent *event) return -1; } spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr); - return 1; - } - if (!strncmp(driver, "vfio-pci", 8)) { + } else if (!strncmp(driver, "vfio-pci", 8)) { struct spdk_pci_addr pci_addr; event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_VFIO; @@ -161,10 +160,11 @@ parse_event(const char *buf, struct spdk_uevent *event) return -1; } spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr); - return 1; - + } else { + event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UNRECOGNIZED; } - return -1; + + return 1; } int diff --git a/lib/nvme/nvme_uevent.h b/lib/nvme/nvme_uevent.h index 778d73c2a2a..94f67101e71 100644 --- 
a/lib/nvme/nvme_uevent.h +++ b/lib/nvme/nvme_uevent.h @@ -41,6 +41,7 @@ #ifndef SPDK_UEVENT_H_ #define SPDK_UEVENT_H_ +#define SPDK_NVME_UEVENT_SUBSYSTEM_UNRECOGNIZED 0 #define SPDK_NVME_UEVENT_SUBSYSTEM_UIO 1 #define SPDK_NVME_UEVENT_SUBSYSTEM_VFIO 2 diff --git a/lib/nvme/spdk_nvme.map b/lib/nvme/spdk_nvme.map index b674c8f5019..b110f7f30d4 100644 --- a/lib/nvme/spdk_nvme.map +++ b/lib/nvme/spdk_nvme.map @@ -62,6 +62,7 @@ spdk_nvme_ctrlr_cmd_get_log_page; spdk_nvme_ctrlr_cmd_get_log_page_ext; spdk_nvme_ctrlr_cmd_abort; + spdk_nvme_ctrlr_cmd_abort_ext; spdk_nvme_ctrlr_cmd_set_feature; spdk_nvme_ctrlr_cmd_get_feature; spdk_nvme_ctrlr_cmd_get_feature_ns; @@ -82,6 +83,8 @@ spdk_nvme_ctrlr_map_cmb; spdk_nvme_ctrlr_unmap_cmb; spdk_nvme_ctrlr_get_transport_id; + spdk_nvme_ctrlr_alloc_qid; + spdk_nvme_ctrlr_free_qid; spdk_nvme_poll_group_create; spdk_nvme_poll_group_add; @@ -106,6 +109,7 @@ spdk_nvme_ns_get_dealloc_logical_block_read_value; spdk_nvme_ns_get_optimal_io_boundary; spdk_nvme_ns_get_uuid; + spdk_nvme_ns_get_csi; spdk_nvme_ns_get_flags; spdk_nvme_ns_cmd_write; @@ -135,6 +139,8 @@ spdk_nvme_qpair_remove_cmd_error_injection; spdk_nvme_qpair_print_command; spdk_nvme_qpair_print_completion; + spdk_nvme_print_command; + spdk_nvme_print_completion; spdk_nvme_cpl_get_status_string; diff --git a/lib/nvmf/Makefile b/lib/nvmf/Makefile index 27cb26a041d..150fcd3e4b1 100644 --- a/lib/nvmf/Makefile +++ b/lib/nvmf/Makefile @@ -34,7 +34,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk -SO_VER := 5 +SO_VER := 6 SO_MINOR := 0 C_SRCS = ctrlr.c ctrlr_discovery.c ctrlr_bdev.c \ diff --git a/lib/nvmf/ctrlr.c b/lib/nvmf/ctrlr.c index 46b16a1fba4..92a4a1c27a8 100644 --- a/lib/nvmf/ctrlr.c +++ b/lib/nvmf/ctrlr.c @@ -59,6 +59,8 @@ */ #define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING +#define ANA_TRANSITION_TIME_IN_SEC 10 + /* * Support for custom admin command handlers */ @@ -102,6 +104,23 @@ nvmf_ctrlr_stop_keep_alive_timer(struct spdk_nvmf_ctrlr *ctrlr) spdk_poller_unregister(&ctrlr->keep_alive_poller); } +static void +nvmf_ctrlr_stop_association_timer(struct spdk_nvmf_ctrlr *ctrlr) +{ + if (!ctrlr) { + SPDK_ERRLOG("Controller is NULL\n"); + assert(false); + return; + } + + if (ctrlr->association_timer == NULL) { + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Stop association timer\n"); + spdk_poller_unregister(&ctrlr->association_timer); +} + static void nvmf_ctrlr_disconnect_qpairs_done(struct spdk_io_channel_iter *i, int status) { @@ -181,7 +200,7 @@ nvmf_ctrlr_keep_alive_poll(void *ctx) } } - return 1; + return SPDK_POLLER_BUSY; } static void @@ -330,6 +349,9 @@ nvmf_ctrlr_create(struct spdk_nvmf_subsystem *subsystem, } ctrlr->feat.async_event_configuration.bits.ns_attr_notice = 1; + if (ctrlr->subsys->ana_reporting) { + ctrlr->feat.async_event_configuration.bits.ana_change_notice = 1; + } ctrlr->feat.volatile_write_cache.bits.wce = 1; if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { @@ -390,6 +412,16 @@ nvmf_ctrlr_create(struct spdk_nvmf_subsystem *subsystem, ctrlr->dif_insert_or_strip = transport->opts.dif_insert_or_strip; + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_NVME) { + ctrlr->listener = nvmf_subsystem_find_listener(ctrlr->subsys, + req->qpair->trid); + if (!ctrlr->listener) { + SPDK_ERRLOG("Listener was not found\n"); + free(ctrlr); + return NULL; + } + } + req->qpair->ctrlr = ctrlr; spdk_thread_send_msg(subsystem->thread, 
_nvmf_subsystem_add_ctrlr, req); @@ -403,6 +435,8 @@ _nvmf_ctrlr_destruct(void *ctx) struct spdk_nvmf_reservation_log *log, *log_tmp; nvmf_ctrlr_stop_keep_alive_timer(ctrlr); + nvmf_ctrlr_stop_association_timer(ctrlr); + spdk_bit_array_free(&ctrlr->qpair_mask); TAILQ_FOREACH_SAFE(log, &ctrlr->log_head, link, log_tmp) { TAILQ_REMOVE(&ctrlr->log_head, log, link); @@ -432,6 +466,13 @@ nvmf_ctrlr_add_io_qpair(void *ctx) */ qpair->ctrlr = NULL; + /* Make sure the controller is not being destroyed. */ + if (ctrlr->in_destruct) { + SPDK_ERRLOG("Got I/O connect while ctrlr was being destroyed.\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { SPDK_ERRLOG("I/O connect not allowed on discovery controller\n"); SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); @@ -474,6 +515,7 @@ _nvmf_ctrlr_add_io_qpair(void *ctx) struct spdk_nvmf_qpair *admin_qpair; struct spdk_nvmf_tgt *tgt = qpair->transport->tgt; struct spdk_nvmf_subsystem *subsystem; + const struct spdk_nvmf_subsystem_listener *listener; SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect I/O Queue for controller id 0x%x\n", data->cntlid); @@ -489,6 +531,25 @@ _nvmf_ctrlr_add_io_qpair(void *ctx) return; } + /* fail before passing a message to the controller thread. */ + if (ctrlr->in_destruct) { + SPDK_ERRLOG("Got I/O connect while ctrlr was being destroyed.\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + spdk_nvmf_request_complete(req); + return; + } + + /* If ANA reporting is enabled, check if I/O connect is on the same listener. 
*/ + if (subsystem->ana_reporting) { + listener = nvmf_subsystem_find_listener(subsystem, qpair->trid); + if (listener != ctrlr->listener) { + SPDK_ERRLOG("I/O connect is on a listener different from admin connect\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + spdk_nvmf_request_complete(req); + return; + } + } + admin_qpair = ctrlr->admin_qpair; qpair->ctrlr = ctrlr; spdk_thread_send_msg(admin_qpair->group->thread, nvmf_ctrlr_add_io_qpair, req); @@ -720,6 +781,43 @@ nvmf_ctrlr_cmd_connect(struct spdk_nvmf_request *req) return _nvmf_ctrlr_connect(req); } +static int +nvmf_ctrlr_association_remove(void *ctx) +{ + struct spdk_nvmf_ctrlr *ctrlr = ctx; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Disconnecting host from subsystem %s due to association timeout.\n", + ctrlr->subsys->subnqn); + + rc = spdk_nvmf_qpair_disconnect(ctrlr->admin_qpair, NULL, NULL); + if (rc < 0) { + SPDK_ERRLOG("Fail to disconnect admin ctrlr qpair\n"); + assert(false); + } + + nvmf_ctrlr_stop_association_timer(ctrlr); + return 1; +} + +static void +nvmf_ctrlr_cc_shn_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_ctrlr *ctrlr = spdk_io_channel_iter_get_ctx(i); + + if (status < 0) { + SPDK_ERRLOG("Fail to disconnect io ctrlr qpairs\n"); + assert(false); + } + + ctrlr->vcprop.csts.bits.shst = SPDK_NVME_SHST_COMPLETE; + + /* After CC.EN transitions to 0 (due to shutdown or reset), the association + * between the host and controller shall be preserved for at least 2 minutes */ + ctrlr->association_timer = SPDK_POLLER_REGISTER(nvmf_ctrlr_association_remove, ctrlr, + ctrlr->admin_qpair->transport->opts.association_timeout * 1000); +} + static void nvmf_ctrlr_cc_reset_done(struct spdk_io_channel_iter *i, int status) { @@ -734,6 +832,10 @@ nvmf_ctrlr_cc_reset_done(struct spdk_io_channel_iter *i, int status) ctrlr->vcprop.cc.raw = 0; ctrlr->vcprop.csts.raw = 0; + /* After CC.EN transitions to 0 (due to shutdown or reset), the association + * between the host and controller 
shall be preserved for at least 2 minutes */ + ctrlr->association_timer = SPDK_POLLER_REGISTER(nvmf_ctrlr_association_remove, ctrlr, + ctrlr->admin_qpair->transport->opts.association_timeout * 1000); } const struct spdk_nvmf_registers * @@ -779,8 +881,9 @@ nvmf_prop_set_cc(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) if (diff.bits.en) { if (cc.bits.en) { SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Enable!\n"); + nvmf_ctrlr_stop_association_timer(ctrlr); + ctrlr->vcprop.cc.bits.en = 1; - ctrlr->vcprop.csts.raw = 0; ctrlr->vcprop.csts.bits.rdy = 1; } else { SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Disable!\n"); @@ -799,12 +902,14 @@ nvmf_prop_set_cc(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Shutdown %u%ub!\n", cc.bits.shn >> 1, cc.bits.shn & 1); ctrlr->vcprop.cc.bits.shn = cc.bits.shn; - ctrlr->vcprop.cc.bits.en = 0; - ctrlr->vcprop.csts.bits.rdy = 0; - ctrlr->vcprop.csts.bits.shst = SPDK_NVME_SHST_COMPLETE; spdk_for_each_channel(ctrlr->subsys->tgt, nvmf_ctrlr_disconnect_io_qpairs_on_pg, - ctrlr, NULL); + ctrlr, + nvmf_ctrlr_cc_shn_done); + + /* From the time a shutdown is initiated the controller shall disable + * Keep Alive timer */ + nvmf_ctrlr_stop_keep_alive_timer(ctrlr); } else if (cc.bits.shn == 0) { ctrlr->vcprop.cc.bits.shn = 0; } else { @@ -871,8 +976,9 @@ nvmf_prop_set_aqa(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) aqa.raw = value; - if (aqa.bits.asqs > ctrlr->vcprop.cap.bits.mqes || - aqa.bits.acqs > ctrlr->vcprop.cap.bits.mqes) { + if (aqa.bits.asqs < SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES - 1 || + aqa.bits.acqs < SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES - 1 || + aqa.bits.reserved1 != 0 || aqa.bits.reserved2 != 0) { return false; } @@ -1301,7 +1407,7 @@ nvmf_ctrlr_get_features_reservation_notification_mask(struct spdk_nvmf_request * SPDK_DEBUGLOG(SPDK_LOG_NVMF, "get Features - Reservation Notificaton Mask\n"); - if (cmd->nsid == 0xffffffffu) { + if (cmd->nsid == SPDK_NVME_GLOBAL_NS_TAG) { 
SPDK_ERRLOG("get Features - Invalid Namespace ID\n"); rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; @@ -1329,7 +1435,7 @@ nvmf_ctrlr_set_features_reservation_notification_mask(struct spdk_nvmf_request * SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Reservation Notificaton Mask\n"); - if (cmd->nsid == 0xffffffffu) { + if (cmd->nsid == SPDK_NVME_GLOBAL_NS_TAG) { for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { ns->mask = cmd->cdw11; @@ -1359,7 +1465,7 @@ nvmf_ctrlr_get_features_reservation_persistence(struct spdk_nvmf_request *req) SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Reservation Persistence\n"); ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); - /* NSID with 0xffffffffu also included */ + /* NSID with SPDK_NVME_GLOBAL_NS_TAG (=0xffffffff) also included */ if (ns == NULL) { SPDK_ERRLOG("Get Features - Invalid Namespace ID\n"); response->status.sct = SPDK_NVME_SCT_GENERIC; @@ -1388,9 +1494,9 @@ nvmf_ctrlr_set_features_reservation_persistence(struct spdk_nvmf_request *req) ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); ptpl = cmd->cdw11_bits.feat_rsv_persistence.bits.ptpl; - if (cmd->nsid != 0xffffffffu && ns && ns->ptpl_file) { + if (cmd->nsid != SPDK_NVME_GLOBAL_NS_TAG && ns && ns->ptpl_file) { ns->ptpl_activated = ptpl; - } else if (cmd->nsid == 0xffffffffu) { + } else if (cmd->nsid == SPDK_NVME_GLOBAL_NS_TAG) { for (ns = spdk_nvmf_subsystem_get_first_ns(ctrlr->subsys); ns && ns->ptpl_file; ns = spdk_nvmf_subsystem_get_next_ns(ctrlr->subsys, ns)) { ns->ptpl_activated = ptpl; @@ -1499,6 +1605,11 @@ nvmf_ctrlr_async_event_request(struct spdk_nvmf_request *req) SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Async Event Request\n"); + /* AER cmd is an exception */ + sgroup = &req->qpair->group->sgroups[ctrlr->subsys->id]; + assert(sgroup != NULL); + sgroup->io_outstanding--; + /* Four asynchronous events are supported for now */ if 
(ctrlr->nr_aer_reqs >= NVMF_MAX_ASYNC_EVENTS) { SPDK_DEBUGLOG(SPDK_LOG_NVMF, "AERL exceeded\n"); @@ -1521,10 +1632,6 @@ nvmf_ctrlr_async_event_request(struct spdk_nvmf_request *req) return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; } - /* AER cmd is an exception */ - sgroup = &req->qpair->group->sgroups[ctrlr->subsys->id]; - sgroup->io_outstanding--; - ctrlr->aer_req[ctrlr->nr_aer_reqs++] = req; return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; } @@ -1548,6 +1655,79 @@ nvmf_get_firmware_slot_log_page(void *buffer, uint64_t offset, uint32_t length) } } +#define SPDK_NVMF_ANA_DESC_SIZE (sizeof(struct spdk_nvme_ana_group_descriptor) + \ + sizeof(uint32_t)) +static void +nvmf_get_ana_log_page(struct spdk_nvmf_ctrlr *ctrlr, void *data, + uint64_t offset, uint32_t length) +{ + char *buf = data; + struct spdk_nvme_ana_page ana_hdr; + char _ana_desc[SPDK_NVMF_ANA_DESC_SIZE]; + struct spdk_nvme_ana_group_descriptor *ana_desc; + size_t copy_len; + uint32_t num_ns = 0; + struct spdk_nvmf_ns *ns; + + if (length == 0) { + return; + } + + if (offset >= sizeof(ana_hdr)) { + offset -= sizeof(ana_hdr); + } else { + for (ns = spdk_nvmf_subsystem_get_first_ns(ctrlr->subsys); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(ctrlr->subsys, ns)) { + num_ns++; + } + + memset(&ana_hdr, 0, sizeof(ana_hdr)); + + ana_hdr.num_ana_group_desc = num_ns; + /* TODO: Support Change Count. 
*/ + ana_hdr.change_count = 0; + + copy_len = spdk_min(sizeof(ana_hdr) - offset, length); + memcpy(buf, (const char *)&ana_hdr + offset, copy_len); + length -= copy_len; + buf += copy_len; + offset = 0; + } + + if (length == 0) { + return; + } + + ana_desc = (void *)_ana_desc; + + for (ns = spdk_nvmf_subsystem_get_first_ns(ctrlr->subsys); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(ctrlr->subsys, ns)) { + if (offset >= SPDK_NVMF_ANA_DESC_SIZE) { + offset -= SPDK_NVMF_ANA_DESC_SIZE; + continue; + } + + memset(ana_desc, 0, SPDK_NVMF_ANA_DESC_SIZE); + + ana_desc->ana_group_id = ns->nsid; + ana_desc->num_of_nsid = 1; + ana_desc->ana_state = ctrlr->listener->ana_state; + ana_desc->nsid[0] = ns->nsid; + /* TODO: Support Change Count. */ + ana_desc->change_count = 0; + + copy_len = spdk_min(SPDK_NVMF_ANA_DESC_SIZE - offset, length); + memcpy(buf, (const char *)ana_desc + offset, copy_len); + length -= copy_len; + buf += copy_len; + offset = 0; + + if (length == 0) { + return; + } + } +} + void nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid) { @@ -1749,6 +1929,13 @@ nvmf_ctrlr_get_log_page(struct spdk_nvmf_request *req) case SPDK_NVME_LOG_FIRMWARE_SLOT: nvmf_get_firmware_slot_log_page(req->data, offset, len); return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS: + if (subsystem->ana_reporting) { + nvmf_get_ana_log_page(ctrlr, req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } else { + goto invalid_log_page; + } case SPDK_NVME_LOG_COMMAND_EFFECTS_LOG: nvmf_get_cmds_and_effects_log_page(req->data, offset, len); return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; @@ -1809,6 +1996,16 @@ spdk_nvmf_ctrlr_identify_ns(struct spdk_nvmf_ctrlr *ctrlr, nsdata->noiob = max_num_blocks; } + if (subsystem->ana_reporting) { + /* ANA group ID matches NSID. 
*/ + nsdata->anagrpid = ns->nsid; + + if (ctrlr->listener->ana_state == SPDK_NVME_ANA_INACCESSIBLE_STATE || + ctrlr->listener->ana_state == SPDK_NVME_ANA_PERSISTENT_LOSS_STATE) { + nsdata->nuse = 0; + } + } + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; } @@ -1858,6 +2055,9 @@ spdk_nvmf_ctrlr_identify_ctrlr(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_c cdata->sgls = ctrlr->cdata.sgls; cdata->fuses.compare_and_write = 1; cdata->acwu = 1; + if (subsystem->ana_reporting) { + cdata->mnan = subsystem->max_nsid; + } spdk_strcpy_pad(cdata->subnqn, subsystem->subnqn, sizeof(cdata->subnqn), '\0'); SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ctrlr data: maxcmd 0x%x\n", cdata->maxcmd); @@ -1874,8 +2074,17 @@ spdk_nvmf_ctrlr_identify_ctrlr(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_c cdata->rab = 6; cdata->cmic.multi_port = 1; cdata->cmic.multi_host = 1; + if (subsystem->ana_reporting) { + /* Asymmetric Namespace Access Reporting is supported. */ + cdata->cmic.ana_reporting = 1; + } cdata->oaes.ns_attribute_notices = 1; + if (subsystem->ana_reporting) { + cdata->oaes.ana_change_notices = 1; + } cdata->ctratt.host_id_exhid_supported = 1; + /* TODO: Concurrent execution of multiple abort commands. */ + cdata->acl = 0; cdata->aerl = 0; cdata->frmw.slot1_ro = 1; cdata->frmw.num_slots = 1; @@ -1895,6 +2104,19 @@ spdk_nvmf_ctrlr_identify_ctrlr(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_c cdata->oncs.dsm = nvmf_ctrlr_dsm_supported(ctrlr); cdata->oncs.write_zeroes = nvmf_ctrlr_write_zeroes_supported(ctrlr); cdata->oncs.reservations = 1; + if (subsystem->ana_reporting) { + cdata->anatt = ANA_TRANSITION_TIME_IN_SEC; + /* ANA Change state is not used, and ANA Persistent Loss state + * is not supported for now. 
+ */ + cdata->anacap.ana_optimized_state = 1; + cdata->anacap.ana_non_optimized_state = 1; + cdata->anacap.ana_inaccessible_state = 1; + /* ANAGRPID does not change while namespace is attached to controller */ + cdata->anacap.no_change_anagrpid = 1; + cdata->anagrpmax = subsystem->max_nsid; + cdata->nanagrpid = subsystem->max_nsid; + } nvmf_ctrlr_populate_oacs(ctrlr, cdata); @@ -2090,12 +2312,34 @@ nvmf_qpair_abort_aer(struct spdk_nvmf_qpair *qpair, uint16_t cid) return false; } +static void +nvmf_qpair_abort_request(struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_request *req) +{ + uint16_t cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; + + if (nvmf_qpair_abort_aer(qpair, cid)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "abort ctrlr=%p sqid=%u cid=%u successful\n", + qpair->ctrlr, qpair->qid, cid); + req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command successfully aborted */ + + spdk_nvmf_request_complete(req); + return; + } + + nvmf_transport_qpair_abort_request(qpair, req); +} + static void nvmf_ctrlr_abort_done(struct spdk_io_channel_iter *i, int status) { struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); - _nvmf_request_complete(req); + if (status == 0) { + /* There was no qpair whose ID matches SQID of the abort command. + * Hence call _nvmf_request_complete() here. 
+ */ + _nvmf_request_complete(req); + } } static void @@ -2104,34 +2348,18 @@ nvmf_ctrlr_abort_on_pg(struct spdk_io_channel_iter *i) struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch); - struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; - struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; - uint16_t sqid = cmd->cdw10_bits.abort.sqid; + uint16_t sqid = req->cmd->nvme_cmd.cdw10_bits.abort.sqid; struct spdk_nvmf_qpair *qpair; TAILQ_FOREACH(qpair, &group->qpairs, link) { if (qpair->ctrlr == req->qpair->ctrlr && qpair->qid == sqid) { - uint16_t cid = cmd->cdw10_bits.abort.cid; - /* Found the qpair */ - if (!nvmf_qpair_abort_aer(qpair, cid)) { - /* TODO: track list of outstanding requests in qpair? */ - SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cid %u not found\n", cid); - rsp->status.sct = SPDK_NVME_SCT_GENERIC; - rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; - spdk_for_each_channel_continue(i, -EINVAL); - return; - } + nvmf_qpair_abort_request(qpair, req); - SPDK_DEBUGLOG(SPDK_LOG_NVMF, "abort ctrlr=%p sqid=%u cid=%u successful\n", - qpair->ctrlr, sqid, cid); - rsp->cdw0 = 0; /* Command successfully aborted */ - rsp->status.sct = SPDK_NVME_SCT_GENERIC; - rsp->status.sc = SPDK_NVME_SC_SUCCESS; /* Return -1 for the status so the iteration across threads stops. */ spdk_for_each_channel_continue(i, -1); - + return; } } @@ -2143,9 +2371,9 @@ nvmf_ctrlr_abort(struct spdk_nvmf_request *req) { struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; - rsp->cdw0 = 1; /* Command not aborted */ - rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; - rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + rsp->cdw0 = 1U; /* Command not aborted */ + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_SUCCESS; /* Send a message to each poll group, searching for this ctrlr, sqid, and command. 
*/ spdk_for_each_channel(req->qpair->ctrlr->subsys->tgt, @@ -2157,6 +2385,31 @@ nvmf_ctrlr_abort(struct spdk_nvmf_request *req) return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; } +int +nvmf_ctrlr_abort_request(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_request *req_to_abort = req->req_to_abort; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + int rc; + + assert(req_to_abort != NULL); + + if (g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_ABORT].hdlr && + nvmf_qpair_is_admin_queue(req_to_abort->qpair)) { + return g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_ABORT].hdlr(req); + } + + rc = spdk_nvmf_request_get_bdev(req_to_abort->cmd->nvme_cmd.nsid, req_to_abort, + &bdev, &desc, &ch); + if (rc != 0) { + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return spdk_nvmf_bdev_ctrlr_abort_cmd(bdev, desc, ch, req, req_to_abort); +} + static int get_features_generic(struct spdk_nvmf_request *req, uint32_t cdw0) { @@ -2166,6 +2419,24 @@ get_features_generic(struct spdk_nvmf_request *req, uint32_t cdw0) return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; } +/* we have to use the typedef in the function declaration to appease astyle. 
*/ +typedef enum spdk_nvme_path_status_code spdk_nvme_path_status_code_t; + +static spdk_nvme_path_status_code_t +_nvme_ana_state_to_path_status(enum spdk_nvme_ana_state ana_state) +{ + switch (ana_state) { + case SPDK_NVME_ANA_INACCESSIBLE_STATE: + return SPDK_NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE; + case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: + return SPDK_NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS; + case SPDK_NVME_ANA_CHANGE_STATE: + return SPDK_NVME_SC_ASYMMETRIC_ACCESS_TRANSITION; + default: + return SPDK_NVME_SC_INTERNAL_PATH_ERROR; + } +} + static int nvmf_ctrlr_get_features(struct spdk_nvmf_request *req) { @@ -2173,8 +2444,31 @@ nvmf_ctrlr_get_features(struct spdk_nvmf_request *req) struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + enum spdk_nvme_ana_state ana_state; feature = cmd->cdw10_bits.get_features.fid; + + ana_state = ctrlr->listener->ana_state; + switch (ana_state) { + case SPDK_NVME_ANA_INACCESSIBLE_STATE: + case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: + case SPDK_NVME_ANA_CHANGE_STATE: + switch (feature) { + case SPDK_NVME_FEAT_ERROR_RECOVERY: + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + case SPDK_NVME_FEAT_HOST_RESERVE_MASK: + case SPDK_NVME_FEAT_HOST_RESERVE_PERSIST: + response->status.sct = SPDK_NVME_SCT_PATH; + response->status.sc = _nvme_ana_state_to_path_status(ana_state); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + break; + } + break; + default: + break; + } + switch (feature) { case SPDK_NVME_FEAT_ARBITRATION: return get_features_generic(req, ctrlr->feat.arbitration.raw); @@ -2211,9 +2505,10 @@ static int nvmf_ctrlr_set_features(struct spdk_nvmf_request *req) { uint8_t feature, save; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; - + enum spdk_nvme_ana_state ana_state; /* * Features are not saveable by the controller 
as indicated by * ONCS field of the Identify Controller data. @@ -2226,6 +2521,37 @@ nvmf_ctrlr_set_features(struct spdk_nvmf_request *req) } feature = cmd->cdw10_bits.set_features.fid; + + ana_state = ctrlr->listener->ana_state; + switch (ana_state) { + case SPDK_NVME_ANA_INACCESSIBLE_STATE: + case SPDK_NVME_ANA_CHANGE_STATE: + if (cmd->nsid == SPDK_NVME_GLOBAL_NS_TAG) { + response->status.sct = SPDK_NVME_SCT_PATH; + response->status.sc = _nvme_ana_state_to_path_status(ana_state); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } else { + switch (feature) { + case SPDK_NVME_FEAT_ERROR_RECOVERY: + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + case SPDK_NVME_FEAT_HOST_RESERVE_MASK: + case SPDK_NVME_FEAT_HOST_RESERVE_PERSIST: + response->status.sct = SPDK_NVME_SCT_PATH; + response->status.sc = _nvme_ana_state_to_path_status(ana_state); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + break; + } + } + break; + case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: + response->status.sct = SPDK_NVME_SCT_PATH; + response->status.sc = SPDK_NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + break; + } + switch (feature) { case SPDK_NVME_FEAT_ARBITRATION: return nvmf_ctrlr_set_features_arbitration(req); @@ -2315,7 +2641,8 @@ nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req) } } - if (g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].hdlr) { + /* Call a custom adm cmd handler if set. 
Aborts are handled in a different path (see nvmf_passthru_admin_cmd) */ + if (g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].hdlr && cmd->opc != SPDK_NVME_OPC_ABORT) { rc = g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].hdlr(req); if (rc >= SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { /* The handler took care of this commmand */ @@ -2454,6 +2781,37 @@ nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr) return nvmf_ctrlr_async_event_notification(ctrlr, &event); } +int +nvmf_ctrlr_async_event_ana_change_notice(struct spdk_nvmf_ctrlr *ctrlr) +{ + union spdk_nvme_async_event_completion event = {0}; + + /* Users may disable the event notification */ + if (!ctrlr->feat.async_event_configuration.bits.ana_change_notice) { + return 0; + } + + event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE; + event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_ANA_CHANGE; + event.bits.log_page_identifier = SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; + + /* If there is no outstanding AER request, queue the event. Then + * if an AER is later submited, this event can be sent as a + * response. 
+ */ + if (ctrlr->nr_aer_reqs == 0) { + if (ctrlr->notice_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) { + return 0; + } + + ctrlr->notice_event.raw = event.raw; + return 0; + } + + return nvmf_ctrlr_async_event_notification(ctrlr, &event); +} + void nvmf_ctrlr_async_event_reservation_notification(struct spdk_nvmf_ctrlr *ctrlr) { @@ -2778,6 +3136,7 @@ nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req) struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; struct spdk_nvmf_subsystem_pg_ns_info *ns_info; + enum spdk_nvme_ana_state ana_state; /* pre-set response details for this command */ response->status.sc = SPDK_NVME_SC_SUCCESS; @@ -2797,6 +3156,19 @@ nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req) return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; } + /* It will be lower overhead to check if ANA state is optimized or + * non-optimized. + */ + ana_state = ctrlr->listener->ana_state; + if (spdk_unlikely(ana_state != SPDK_NVME_ANA_OPTIMIZED_STATE && + ana_state != SPDK_NVME_ANA_NON_OPTIMIZED_STATE)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Fail I/O command due to ANA state %d\n", + ana_state); + response->status.sct = SPDK_NVME_SCT_PATH; + response->status.sc = _nvme_ana_state_to_path_status(ana_state); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); if (ns == NULL || ns->bdev == NULL) { SPDK_ERRLOG("Unsuccessful query for nsid %u\n", cmd->nsid); @@ -2899,14 +3271,15 @@ _nvmf_request_complete(void *ctx) qpair = req->qpair; if (qpair->ctrlr) { sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + assert(sgroup != NULL); is_aer = req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; } else if (spdk_unlikely(nvmf_request_is_fabric_connect(req))) { sgroup = nvmf_subsystem_pg_from_connect_cmd(req); } - SPDK_DEBUGLOG(SPDK_LOG_NVMF, - "cpl: cdw0=0x%08x sct=0x%01x sc=0x%02x cid=0x%04x\n", - rsp->cdw0, rsp->status.sct, 
rsp->status.sc, rsp->cid); + if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf")) { + spdk_nvme_print_completion(qpair->qid, rsp); + } TAILQ_REMOVE(&qpair->outstanding, req, link); if (nvmf_transport_req_complete(req)) { @@ -2942,50 +3315,6 @@ spdk_nvmf_request_complete(struct spdk_nvmf_request *req) return 0; } -static void -nvmf_trace_command(union nvmf_h2c_msg *h2c_msg, bool is_admin_queue) -{ - struct spdk_nvmf_capsule_cmd *cap_hdr = &h2c_msg->nvmf_cmd; - struct spdk_nvme_cmd *cmd = &h2c_msg->nvme_cmd; - struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; - uint8_t opc; - - if (cmd->opc == SPDK_NVME_OPC_FABRIC) { - opc = cap_hdr->fctype; - SPDK_DEBUGLOG(SPDK_LOG_NVMF, "%s Fabrics cmd: fctype 0x%02x cid %u\n", - is_admin_queue ? "Admin" : "I/O", - cap_hdr->fctype, cap_hdr->cid); - } else { - opc = cmd->opc; - SPDK_DEBUGLOG(SPDK_LOG_NVMF, "%s cmd: opc 0x%02x fuse %u cid %u nsid %u cdw10 0x%08x\n", - is_admin_queue ? "Admin" : "I/O", - cmd->opc, cmd->fuse, cmd->cid, cmd->nsid, cmd->cdw10); - if (cmd->mptr) { - SPDK_DEBUGLOG(SPDK_LOG_NVMF, "mptr 0x%" PRIx64 "\n", cmd->mptr); - } - if (cmd->psdt != SPDK_NVME_PSDT_SGL_MPTR_CONTIG && - cmd->psdt != SPDK_NVME_PSDT_SGL_MPTR_SGL) { - SPDK_DEBUGLOG(SPDK_LOG_NVMF, "psdt %u\n", cmd->psdt); - } - } - - if (spdk_nvme_opc_get_data_transfer(opc) != SPDK_NVME_DATA_NONE) { - if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK) { - SPDK_DEBUGLOG(SPDK_LOG_NVMF, - "SGL: Keyed%s: addr 0x%" PRIx64 " key 0x%x len 0x%x\n", - sgl->generic.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY ? " (Inv)" : "", - sgl->address, sgl->keyed.key, sgl->keyed.length); - } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { - SPDK_DEBUGLOG(SPDK_LOG_NVMF, "SGL: Data block: %s 0x%" PRIx64 " len 0x%x\n", - sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET ? 
"offs" : "addr", - sgl->address, sgl->unkeyed.length); - } else { - SPDK_DEBUGLOG(SPDK_LOG_NVMF, "SGL type 0x%x subtype 0x%x\n", - sgl->generic.type, sgl->generic.subtype); - } - } -} - static void _nvmf_request_exec(struct spdk_nvmf_request *req, struct spdk_nvmf_subsystem_poll_group *sgroup) @@ -2993,7 +3322,9 @@ _nvmf_request_exec(struct spdk_nvmf_request *req, struct spdk_nvmf_qpair *qpair = req->qpair; enum spdk_nvmf_request_exec_status status; - nvmf_trace_command(req->cmd, nvmf_qpair_is_admin_queue(qpair)); + if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf")) { + spdk_nvme_print_command(qpair->qid, &req->cmd->nvme_cmd); + } if (sgroup) { sgroup->io_outstanding++; @@ -3015,23 +3346,6 @@ _nvmf_request_exec(struct spdk_nvmf_request *req, } } -void -spdk_nvmf_request_exec_fabrics(struct spdk_nvmf_request *req) -{ - struct spdk_nvmf_qpair *qpair = req->qpair; - struct spdk_nvmf_subsystem_poll_group *sgroup = NULL; - - assert(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC); - - if (qpair->ctrlr) { - sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; - } else { - sgroup = nvmf_subsystem_pg_from_connect_cmd(req); - } - - _nvmf_request_exec(req, sgroup); -} - void spdk_nvmf_request_exec(struct spdk_nvmf_request *req) { @@ -3040,6 +3354,7 @@ spdk_nvmf_request_exec(struct spdk_nvmf_request *req) if (qpair->ctrlr) { sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + assert(sgroup != NULL); } else if (spdk_unlikely(nvmf_request_is_fabric_connect(req))) { sgroup = nvmf_subsystem_pg_from_connect_cmd(req); } @@ -3220,3 +3535,13 @@ struct spdk_nvmf_subsystem *spdk_nvmf_ctrlr_get_subsystem(struct spdk_nvmf_ctrlr { return ctrlr->subsys; } + +uint16_t spdk_nvmf_ctrlr_get_id(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->cntlid; +} + +struct spdk_nvmf_request *spdk_nvmf_request_get_req_to_abort(struct spdk_nvmf_request *req) +{ + return req->req_to_abort; +} diff --git a/lib/nvmf/ctrlr_bdev.c b/lib/nvmf/ctrlr_bdev.c index e297fc160f6..13e0a4309f1 100644 --- 
a/lib/nvmf/ctrlr_bdev.c +++ b/lib/nvmf/ctrlr_bdev.c @@ -693,6 +693,39 @@ spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(struct spdk_bdev *bdev, struct spdk_bde return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; } +static void +nvmf_bdev_ctrlr_complete_abort_cmd(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_nvmf_request *req = cb_arg; + + if (success) { + req->rsp->nvme_cpl.cdw0 &= ~1U; + } + + spdk_nvmf_request_complete(req); + spdk_bdev_free_io(bdev_io); +} + +int +spdk_nvmf_bdev_ctrlr_abort_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct spdk_nvmf_request *req_to_abort) +{ + int rc; + + assert((req->rsp->nvme_cpl.cdw0 & 1U) != 0); + + rc = spdk_bdev_abort(desc, ch, req_to_abort, nvmf_bdev_ctrlr_complete_abort_cmd, req); + if (spdk_likely(rc == 0)) { + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } else if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_admin_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } else { + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + bool nvmf_bdev_ctrlr_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_nvme_cmd *cmd, struct spdk_dif_ctx *dif_ctx) diff --git a/lib/nvmf/fc.c b/lib/nvmf/fc.c index a2bf87b52cf..678cfc681a4 100644 --- a/lib/nvmf/fc.c +++ b/lib/nvmf/fc.c @@ -2116,6 +2116,13 @@ nvmf_fc_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, return 0; } +static void +nvmf_fc_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + spdk_nvmf_request_complete(req); +} + const struct spdk_nvmf_transport_ops spdk_nvmf_transport_fc = { .name = "FC", .type = (enum spdk_nvme_transport_type) SPDK_NVMF_TRTYPE_FC, @@ -2140,6 +2147,7 @@ const struct spdk_nvmf_transport_ops spdk_nvmf_transport_fc = { .qpair_get_peer_trid = nvmf_fc_qpair_get_peer_trid, .qpair_get_local_trid = nvmf_fc_qpair_get_local_trid, .qpair_get_listen_trid = 
nvmf_fc_qpair_get_listen_trid, + .qpair_abort_request = nvmf_fc_qpair_abort_request, }; /* diff --git a/lib/nvmf/muser.c b/lib/nvmf/muser.c index e6137f05443..a1a1ff2afb3 100644 --- a/lib/nvmf/muser.c +++ b/lib/nvmf/muser.c @@ -860,6 +860,7 @@ init_qp(struct muser_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, qpair->qpair.transport = transport; qpair->ctrlr = ctrlr; qpair->qsize = qsize; + qpair->state = MUSER_QPAIR_INACTIVE; TAILQ_INIT(&qpair->reqs); @@ -1338,13 +1339,75 @@ handle_dbl_access(struct muser_ctrlr *ctrlr, uint32_t *buf, return 0; } +static int +destroy_ctrlr(struct muser_ctrlr *ctrlr) +{ + int i; + + if (ctrlr == NULL) { + return 0; + } + + for (i = 0; i < MUSER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { + destroy_qp(ctrlr, i); + } + + if (ctrlr->endpoint) { + ctrlr->endpoint->ctrlr = NULL; + } + + free(ctrlr); + return 0; +} + +static int +handle_admin_queue_connect_rsp(struct muser_req *connect_req, void *cb_arg) +{ + struct muser_poll_group *muser_group; + struct muser_req *req = cb_arg; + struct muser_qpair *qpair = cb_arg; + struct muser_ctrlr *ctrlr; + + assert(qpair != NULL); + assert(connect_req != NULL); + + muser_group = SPDK_CONTAINEROF(qpair->group, struct muser_poll_group, group); + TAILQ_INSERT_TAIL(&muser_group->qps, qpair, link); + qpair->state = MUSER_QPAIR_ACTIVE; + + ctrlr = qpair->ctrlr; + assert(ctrlr != NULL); + + if (spdk_nvme_cpl_is_error(&connect_req->req.rsp->nvme_cpl)) { + muser_req_free(&req->req); + destroy_qp(ctrlr, qpair->qpair.qid); + destroy_ctrlr(ctrlr); + return -1; + } + + if (nvmf_qpair_is_admin_queue(&qpair->qpair)) { + ctrlr->cntlid = qpair->qpair.ctrlr->cntlid; + ctrlr->ready = true; + } + + free(connect_req->req.data); + connect_req->req.data = NULL; + + /* Submit the property get/set that triggered this connect */ + spdk_nvmf_request_exec(&req->req); + + return 0; +} + static ssize_t access_bar0_fn(void *pvt, char *buf, size_t count, loff_t pos, bool is_write) { struct muser_endpoint *muser_ep = pvt; struct 
muser_ctrlr *ctrlr; - struct muser_req *req; + struct muser_req *req, *connect_req; + struct muser_qpair *qpair; + struct spdk_nvmf_fabric_connect_data *data; int ret; ctrlr = muser_ep->ctrlr; @@ -1370,6 +1433,11 @@ access_bar0_fn(void *pvt, char *buf, size_t count, loff_t pos, return ret; } + qpair = ctrlr->qp[0]; + + /* Mark the controller as busy to limit the queue depth for fabric get/set to 1 */ + ctrlr->ready = false; + /* Construct a Fabric Property Get/Set command and send it */ req = get_muser_req(ctrlr->qp[0]); @@ -1397,10 +1465,46 @@ access_bar0_fn(void *pvt, char *buf, size_t count, loff_t pos, req->req.length = count; req->req.data = buf; - /* Mark the controller as busy to limit the queue depth for fabric get/set to 1 */ - ctrlr->ready = false; + if (qpair->state != MUSER_QPAIR_ACTIVE) { + /* The fabric CONNECT command is sent when the first register write occurs. + * Send this first, then send the property get/set request. */ + + connect_req = get_muser_req(ctrlr->qp[0]); + if (connect_req == NULL) { + return -1; + } + + connect_req->cb_fn = handle_admin_queue_connect_rsp; + connect_req->cb_arg = req; + + connect_req->req.cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; + connect_req->req.cmd->connect_cmd.cid = connect_req->cid; + connect_req->req.cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; + connect_req->req.cmd->connect_cmd.recfmt = 0; + connect_req->req.cmd->connect_cmd.sqsize = qpair->qsize - 1; + connect_req->req.cmd->connect_cmd.qid = 0; + + connect_req->req.length = sizeof(struct spdk_nvmf_fabric_connect_data); + connect_req->req.data = calloc(1, connect_req->req.length); + if (connect_req->req.data == NULL) { + muser_req_free(&req->req); + muser_req_free(&connect_req->req); + return -ENOMEM; + } + + data = (struct spdk_nvmf_fabric_connect_data *)connect_req->req.data; + data->cntlid = 0xFFFF; + snprintf(data->subnqn, sizeof(data->subnqn), "%s", + spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); + + 
SPDK_DEBUGLOG(SPDK_LOG_MUSER, + "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n", + ctrlr_id(ctrlr), qpair->qpair.qid, data->cntlid); - spdk_nvmf_request_exec_fabrics(&req->req); + spdk_nvmf_request_exec(&connect_req->req); + } else { + spdk_nvmf_request_exec(&req->req); + } return count; } @@ -1591,27 +1695,6 @@ init_pci_config_space(lm_pci_config_space_t *p) p->hdr.intr.ipin = 0x1; } -static int -destroy_ctrlr(struct muser_ctrlr *ctrlr) -{ - int i; - - if (ctrlr == NULL) { - return 0; - } - - for (i = 0; i < MUSER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { - destroy_qp(ctrlr, i); - } - - if (ctrlr->endpoint) { - ctrlr->endpoint->ctrlr = NULL; - } - - free(ctrlr); - return 0; -} - static void spdk_map_dma(void *pvt, uint64_t iova, uint64_t len) { @@ -1788,6 +1871,7 @@ muser_listen(struct spdk_nvmf_transport *transport, goto out; } + unlink(path); free(path); err = ftruncate(fd, DOORBELLS + MUSER_DOORBELLS_SIZE); @@ -1872,12 +1956,10 @@ muser_stop_listen(struct spdk_nvmf_transport *transport, SPDK_DEBUGLOG(SPDK_LOG_MUSER, "%s: not found\n", trid->traddr); } -static void +static int muser_listen_associate(struct spdk_nvmf_transport *transport, const struct spdk_nvmf_subsystem *subsystem, - const struct spdk_nvme_transport_id *trid, - spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn, - void *cb_arg) + const struct spdk_nvme_transport_id *trid) { struct muser_transport *mtransport; struct muser_endpoint *muser_ep; @@ -1891,13 +1973,12 @@ muser_listen_associate(struct spdk_nvmf_transport *transport, } if (muser_ep == NULL) { - cb_fn(cb_arg, -ENOENT); - return; + return -ENOENT; } muser_ep->subsystem = subsystem; - cb_fn(cb_arg, 0); + return 0; } /* @@ -2042,7 +2123,6 @@ muser_poll_group_add(struct spdk_nvmf_transport_poll_group *group, struct muser_ctrlr *muser_ctrlr; struct spdk_nvmf_request *req; struct spdk_nvmf_fabric_connect_data *data; - bool admin; muser_group = SPDK_CONTAINEROF(group, struct muser_poll_group, group); muser_qpair = SPDK_CONTAINEROF(qpair, 
struct muser_qpair, qpair); @@ -2053,7 +2133,10 @@ muser_poll_group_add(struct spdk_nvmf_transport_poll_group *group, ctrlr_id(muser_ctrlr), muser_qpair->qpair.qid, muser_qpair, qpair, muser_group); - admin = nvmf_qpair_is_admin_queue(&muser_qpair->qpair); + if (nvmf_qpair_is_admin_queue(&muser_qpair->qpair)) { + /* Admin queue creation is deferred to the first register write */ + return 0; + } muser_req = get_muser_req(muser_qpair); if (muser_req == NULL) { @@ -2066,7 +2149,7 @@ muser_poll_group_add(struct spdk_nvmf_transport_poll_group *group, req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; req->cmd->connect_cmd.recfmt = 0; req->cmd->connect_cmd.sqsize = muser_qpair->qsize - 1; - req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; + req->cmd->connect_cmd.qid = qpair->qid; req->length = sizeof(struct spdk_nvmf_fabric_connect_data); req->data = calloc(1, req->length); @@ -2076,7 +2159,7 @@ muser_poll_group_add(struct spdk_nvmf_transport_poll_group *group, } data = (struct spdk_nvmf_fabric_connect_data *)req->data; - data->cntlid = admin ? 0xFFFF : muser_ctrlr->cntlid; + data->cntlid = muser_ctrlr->cntlid; snprintf(data->subnqn, sizeof(data->subnqn), "%s", spdk_nvmf_subsystem_get_nqn(muser_ctrlr->endpoint->subsystem)); @@ -2087,7 +2170,7 @@ muser_poll_group_add(struct spdk_nvmf_transport_poll_group *group, "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n", ctrlr_id(muser_ctrlr), qpair->qid, data->cntlid); - spdk_nvmf_request_exec_fabrics(req); + spdk_nvmf_request_exec(req); return 0; } diff --git a/lib/nvmf/nvmf.c b/lib/nvmf/nvmf.c index fdf22cbb1f1..8cd82e5914e 100644 --- a/lib/nvmf/nvmf.c +++ b/lib/nvmf/nvmf.c @@ -100,12 +100,12 @@ nvmf_poll_group_poll(void *ctx) TAILQ_FOREACH(tgroup, &group->tgroups, link) { rc = nvmf_transport_poll_group_poll(tgroup); if (rc < 0) { - return -1; + return SPDK_POLLER_BUSY; } count += rc; } - return count; + return count > 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; } static int @@ -186,6 +186,8 @@ nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf) free(group->sgroups); + spdk_poller_unregister(&group->poller); + if (group->destroy_cb_fn) { group->destroy_cb_fn(group->destroy_cb_arg, 0); } @@ -226,8 +228,6 @@ nvmf_tgt_destroy_poll_group_qpairs(struct spdk_nvmf_poll_group *group) return; } - spdk_poller_unregister(&group->poller); - ctx->group = group; _nvmf_tgt_disconnect_next_qpair(ctx); } @@ -577,6 +577,7 @@ spdk_nvmf_tgt_write_config_json(struct spdk_json_write_ctx *w, struct spdk_nvmf_ if (transport->ops->type == SPDK_NVME_TRANSPORT_RDMA) { spdk_json_write_named_uint32(w, "max_srq_depth", transport->opts.max_srq_depth); } + spdk_json_write_named_uint32(w, "abort_timeout_sec", transport->opts.abort_timeout_sec); spdk_json_write_object_end(w); spdk_json_write_object_end(w); @@ -857,8 +858,8 @@ spdk_nvmf_poll_group_add(struct spdk_nvmf_poll_group *group, return rc; } -static -void _nvmf_ctrlr_destruct(void *ctx) +static void +_nvmf_ctrlr_destruct(void *ctx) { struct spdk_nvmf_ctrlr *ctrlr = ctx; @@ -883,8 +884,7 @@ _nvmf_ctrlr_free_from_qpair(void *ctx) spdk_bit_array_clear(ctrlr->qpair_mask, qpair_ctx->qid); count = spdk_bit_array_count_set(ctrlr->qpair_mask); if (count == 0) { - spdk_bit_array_free(&ctrlr->qpair_mask); - + ctrlr->in_destruct = true; spdk_thread_send_msg(ctrlr->subsys->thread, _nvmf_ctrlr_destruct, ctrlr); } @@ -931,6 +931,7 @@ spdk_nvmf_poll_group_remove(struct spdk_nvmf_qpair *qpair) } TAILQ_REMOVE(&qpair->group->qpairs, qpair, link); + qpair->group = NULL; } static void @@ -958,11 +959,27 @@ _nvmf_qpair_destroy(void *ctx, int status) spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_free_from_qpair, qpair_ctx); } +static void +_nvmf_qpair_disconnect_msg(void *ctx) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx; + + spdk_nvmf_qpair_disconnect(qpair_ctx->qpair, qpair_ctx->cb_fn, qpair_ctx->ctx); + free(ctx); +} + int spdk_nvmf_qpair_disconnect(struct 
spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_cb cb_fn, void *ctx) { struct nvmf_qpair_disconnect_ctx *qpair_ctx; + if (__atomic_test_and_set(&qpair->disconnect_started, __ATOMIC_RELAXED)) { + if (cb_fn) { + cb_fn(ctx); + } + return 0; + } + /* If we get a qpair in the uninitialized state, we can just destroy it immediately */ if (qpair->state == SPDK_NVMF_QPAIR_UNINITIALIZED) { nvmf_transport_qpair_fini(qpair); @@ -972,19 +989,20 @@ spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_ return 0; } - /* The queue pair must be disconnected from the thread that owns it */ - assert(qpair->group->thread == spdk_get_thread()); - - if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) { - /* This can occur if the connection is killed by the target, - * which results in a notification that the connection - * died. Send a message to defer the processing of this - * callback. This allows the stack to unwind in the case - * where a bunch of connections are disconnected in - * a loop. */ - if (cb_fn) { - spdk_thread_send_msg(qpair->group->thread, cb_fn, ctx); + assert(qpair->group != NULL); + if (spdk_get_thread() != qpair->group->thread) { + /* clear the atomic so we can set it on the next call on the proper thread. 
*/ + __atomic_clear(&qpair->disconnect_started, __ATOMIC_RELAXED); + qpair_ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_ctx)); + if (!qpair_ctx) { + SPDK_ERRLOG("Unable to allocate context for nvmf_qpair_disconnect\n"); + return -ENOMEM; } + qpair_ctx->qpair = qpair; + qpair_ctx->cb_fn = cb_fn; + qpair_ctx->thread = qpair->group->thread; + qpair_ctx->ctx = ctx; + spdk_thread_send_msg(qpair->group->thread, _nvmf_qpair_disconnect_msg, qpair_ctx); return 0; } @@ -1355,7 +1373,7 @@ nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group, _nvmf_poll_group_remove_subsystem_cb(ctx, 0); } - if (rc != 0) { + if (rc != 0 && rc != -EINPROGRESS) { free(ctx); goto fini; } @@ -1386,7 +1404,9 @@ nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group, goto fini; } - assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_ACTIVE); + if (sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSED) { + goto fini; + } sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSING; if (sgroup->io_outstanding > 0) { @@ -1419,7 +1439,9 @@ nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group, sgroup = &group->sgroups[subsystem->id]; - assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSED); + if (sgroup->state == SPDK_NVMF_SUBSYSTEM_ACTIVE) { + goto fini; + } rc = poll_group_update_subsystem(group, subsystem); if (rc) { diff --git a/lib/nvmf/nvmf_internal.h b/lib/nvmf/nvmf_internal.h index 7e5f0a07363..179ae676135 100644 --- a/lib/nvmf/nvmf_internal.h +++ b/lib/nvmf/nvmf_internal.h @@ -57,6 +57,7 @@ enum spdk_nvmf_subsystem_state { SPDK_NVMF_SUBSYSTEM_PAUSED, SPDK_NVMF_SUBSYSTEM_RESUMING, SPDK_NVMF_SUBSYSTEM_DEACTIVATING, + SPDK_NVMF_SUBSYSTEM_NUM_STATES, }; struct spdk_nvmf_tgt { @@ -96,6 +97,8 @@ struct spdk_nvmf_subsystem_listener { void *cb_arg; struct spdk_nvme_transport_id *trid; struct spdk_nvmf_transport *transport; + enum spdk_nvme_ana_state ana_state; + uint64_t ana_state_change_count; TAILQ_ENTRY(spdk_nvmf_subsystem_listener) link; }; @@ -217,6 +220,8 @@ struct spdk_nvmf_ctrlr { 
struct spdk_thread *thread; struct spdk_bit_array *qpair_mask; + const struct spdk_nvmf_subsystem_listener *listener; + struct spdk_nvmf_request *aer_req[NVMF_MAX_ASYNC_EVENTS]; union spdk_nvme_async_event_completion notice_event; union spdk_nvme_async_event_completion reservation_event; @@ -233,7 +238,10 @@ struct spdk_nvmf_ctrlr { uint64_t last_keep_alive_tick; struct spdk_poller *keep_alive_poller; + struct spdk_poller *association_timer; + bool dif_insert_or_strip; + bool in_destruct; TAILQ_ENTRY(spdk_nvmf_ctrlr) link; }; @@ -248,12 +256,16 @@ struct spdk_nvmf_subsystem { uint16_t next_cntlid; bool allow_any_host; bool allow_any_listener; + bool ana_reporting; struct spdk_nvmf_tgt *tgt; char sn[SPDK_NVME_CTRLR_SN_LEN + 1]; char mn[SPDK_NVME_CTRLR_MN_LEN + 1]; + /* boolean for state change synchronization. */ + bool changing_state; + /* Array of pointers to namespaces of size max_nsid indexed by nsid - 1 */ struct spdk_nvmf_ns **ns; uint32_t max_nsid; @@ -328,8 +340,13 @@ struct spdk_nvmf_subsystem_listener *nvmf_subsystem_find_listener( struct spdk_nvmf_listener *nvmf_transport_find_listener( struct spdk_nvmf_transport *transport, const struct spdk_nvme_transport_id *trid); +void nvmf_subsystem_set_ana_state(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid, + enum spdk_nvme_ana_state ana_state, + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn, void *cb_arg); int nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr); +int nvmf_ctrlr_async_event_ana_change_notice(struct spdk_nvmf_ctrlr *ctrlr); void nvmf_ctrlr_async_event_reservation_notification(struct spdk_nvmf_ctrlr *ctrlr); void nvmf_ns_reservation_request(void *ctx); void nvmf_ctrlr_reservation_notice_log(struct spdk_nvmf_ctrlr *ctrlr, @@ -351,6 +368,8 @@ void nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr); */ void nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair); +int nvmf_ctrlr_abort_request(struct spdk_nvmf_request *req); + static inline struct spdk_nvmf_ns 
* _nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) { diff --git a/lib/nvmf/nvmf_rpc.c b/lib/nvmf/nvmf_rpc.c index 075873245c1..044fa2ad243 100644 --- a/lib/nvmf/nvmf_rpc.c +++ b/lib/nvmf/nvmf_rpc.c @@ -39,6 +39,7 @@ #include "spdk/nvmf.h" #include "spdk/string.h" #include "spdk/util.h" +#include "spdk/bit_array.h" #include "spdk_internal/log.h" #include "spdk_internal/assert.h" @@ -343,6 +344,7 @@ struct rpc_subsystem_create { char *tgt_name; uint32_t max_namespaces; bool allow_any_host; + bool ana_reporting; }; static const struct spdk_json_object_decoder rpc_subsystem_create_decoders[] = { @@ -352,6 +354,7 @@ static const struct spdk_json_object_decoder rpc_subsystem_create_decoders[] = { {"tgt_name", offsetof(struct rpc_subsystem_create, tgt_name), spdk_json_decode_string, true}, {"max_namespaces", offsetof(struct rpc_subsystem_create, max_namespaces), spdk_json_decode_uint32, true}, {"allow_any_host", offsetof(struct rpc_subsystem_create, allow_any_host), spdk_json_decode_bool, true}, + {"ana_reporting", offsetof(struct rpc_subsystem_create, ana_reporting), spdk_json_decode_bool, true}, }; static void @@ -434,6 +437,8 @@ rpc_nvmf_create_subsystem(struct spdk_jsonrpc_request *request, spdk_nvmf_subsystem_set_allow_any_host(subsystem, req->allow_any_host); + spdk_nvmf_subsystem_set_ana_reporting(subsystem, req->ana_reporting); + rc = spdk_nvmf_subsystem_start(subsystem, rpc_nvmf_subsystem_started, request); @@ -491,6 +496,7 @@ rpc_nvmf_delete_subsystem(struct spdk_jsonrpc_request *request, struct rpc_delete_subsystem req = { 0 }; struct spdk_nvmf_subsystem *subsystem; struct spdk_nvmf_tgt *tgt; + int rc; if (spdk_json_decode_object(params, rpc_delete_subsystem_decoders, SPDK_COUNTOF(rpc_delete_subsystem_decoders), @@ -518,9 +524,18 @@ rpc_nvmf_delete_subsystem(struct spdk_jsonrpc_request *request, free_rpc_delete_subsystem(&req); - spdk_nvmf_subsystem_stop(subsystem, - rpc_nvmf_subsystem_stopped, - request); + rc = 
spdk_nvmf_subsystem_stop(subsystem, + rpc_nvmf_subsystem_stopped, + request); + if (rc == -EBUSY) { + SPDK_ERRLOG("Subsystem currently in another state change try again later.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Subsystem currently in another state change try again later."); + } else if (rc != 0) { + SPDK_ERRLOG("Unable to change state on subsystem. rc=%d\n", rc); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to change state on subsystem. rc=%d", rc); + } return; @@ -539,14 +554,6 @@ struct rpc_listen_address { char *trsvcid; }; -#define RPC_MAX_LISTEN_ADDRESSES 255 -#define RPC_MAX_NAMESPACES 255 - -struct rpc_listen_addresses { - size_t num_listen_address; - struct rpc_listen_address addresses[RPC_MAX_LISTEN_ADDRESSES]; -}; - static const struct spdk_json_object_decoder rpc_listen_address_decoders[] = { /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */ {"transport", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true}, @@ -581,14 +588,18 @@ free_rpc_listen_address(struct rpc_listen_address *r) enum nvmf_rpc_listen_op { NVMF_RPC_LISTEN_ADD, NVMF_RPC_LISTEN_REMOVE, + NVMF_RPC_LISTEN_SET_ANA_STATE, }; struct nvmf_rpc_listener_ctx { char *nqn; char *tgt_name; struct spdk_nvmf_tgt *tgt; + struct spdk_nvmf_transport *transport; struct spdk_nvmf_subsystem *subsystem; struct rpc_listen_address address; + char *ana_state_str; + enum spdk_nvme_ana_state ana_state; struct spdk_jsonrpc_request *request; struct spdk_nvme_transport_id trid; @@ -608,6 +619,7 @@ nvmf_rpc_listener_ctx_free(struct nvmf_rpc_listener_ctx *ctx) free(ctx->nqn); free(ctx->tgt_name); free_rpc_listen_address(&ctx->address); + free(ctx->ana_state_str); free(ctx); } @@ -650,7 +662,51 @@ nvmf_rpc_subsystem_listen(void *cb_arg, int status) if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_rpc_listen_resumed, ctx)) { if (!ctx->response_sent) { - 
spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Internal error"); + } + nvmf_rpc_listener_ctx_free(ctx); + /* Can't really do anything to recover here - subsystem will remain paused. */ + } +} +static void +nvmf_rpc_stop_listen_async_done(void *cb_arg, int status) +{ + struct nvmf_rpc_listener_ctx *ctx = cb_arg; + + if (status) { + SPDK_ERRLOG("Unable to stop listener.\n"); + spdk_jsonrpc_send_error_response_fmt(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "error stopping listener: %d", status); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_rpc_listen_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Internal error"); + } + nvmf_rpc_listener_ctx_free(ctx); + /* Can't really do anything to recover here - subsystem will remain paused. */ + } +} + +static void +nvmf_rpc_set_ana_state_done(void *cb_arg, int status) +{ + struct nvmf_rpc_listener_ctx *ctx = cb_arg; + + if (status) { + SPDK_ERRLOG("Unable to set ANA state.\n"); + spdk_jsonrpc_send_error_response_fmt(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "error setting ANA state: %d", status); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_rpc_listen_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Internal error"); } nvmf_rpc_listener_ctx_free(ctx); /* Can't really do anything to recover here - subsystem will remain paused. 
*/ @@ -683,14 +739,21 @@ nvmf_rpc_listen_paused(struct spdk_nvmf_subsystem *subsystem, "Invalid parameters"); ctx->response_sent = true; } - spdk_nvmf_tgt_stop_listen(ctx->tgt, &ctx->trid); + spdk_nvmf_transport_stop_listen_async(ctx->transport, &ctx->trid, nvmf_rpc_stop_listen_async_done, + ctx); + return; + } else if (ctx->op == NVMF_RPC_LISTEN_SET_ANA_STATE) { + nvmf_subsystem_set_ana_state(subsystem, &ctx->trid, ctx->ana_state, + nvmf_rpc_set_ana_state_done, ctx); + return; } else { SPDK_UNREACHABLE(); } if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_listen_resumed, ctx)) { if (!ctx->response_sent) { - spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Internal error"); } nvmf_rpc_listener_ctx_free(ctx); /* Can't really do anything to recover here - subsystem will remain paused. */ @@ -750,6 +813,7 @@ rpc_nvmf_subsystem_add_listener(struct spdk_jsonrpc_request *request, struct nvmf_rpc_listener_ctx *ctx; struct spdk_nvmf_subsystem *subsystem; struct spdk_nvmf_tgt *tgt; + int rc; ctx = calloc(1, sizeof(*ctx)); if (!ctx) { @@ -797,8 +861,14 @@ rpc_nvmf_subsystem_add_listener(struct spdk_jsonrpc_request *request, ctx->op = NVMF_RPC_LISTEN_ADD; - if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) { - spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + rc = spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx); + if (rc != 0) { + if (rc == -EBUSY) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "subsystem busy, retry later.\n"); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } nvmf_rpc_listener_ctx_free(ctx); } } @@ -812,6 +882,7 @@ rpc_nvmf_subsystem_remove_listener(struct spdk_jsonrpc_request *request, struct nvmf_rpc_listener_ctx *ctx; struct 
spdk_nvmf_subsystem *subsystem; struct spdk_nvmf_tgt *tgt; + int rc; ctx = calloc(1, sizeof(*ctx)); if (!ctx) { @@ -857,16 +928,132 @@ rpc_nvmf_subsystem_remove_listener(struct spdk_jsonrpc_request *request, return; } + ctx->transport = spdk_nvmf_tgt_get_transport(tgt, ctx->trid.trstring); + if (!ctx->transport) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + ctx->op = NVMF_RPC_LISTEN_REMOVE; - if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) { - spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + rc = spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx); + if (rc != 0) { + if (rc == -EBUSY) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "subsystem busy, retry later.\n"); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } nvmf_rpc_listener_ctx_free(ctx); } } SPDK_RPC_REGISTER("nvmf_subsystem_remove_listener", rpc_nvmf_subsystem_remove_listener, SPDK_RPC_RUNTIME); +static const struct spdk_json_object_decoder nvmf_rpc_set_ana_state_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_listener_ctx, nqn), spdk_json_decode_string}, + {"listen_address", offsetof(struct nvmf_rpc_listener_ctx, address), decode_rpc_listen_address}, + {"ana_state", offsetof(struct nvmf_rpc_listener_ctx, ana_state_str), spdk_json_decode_string}, + {"tgt_name", offsetof(struct nvmf_rpc_listener_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static int +rpc_ana_state_parse(const char *str, enum spdk_nvme_ana_state *ana_state) +{ + if (ana_state == NULL || str == NULL) { + return -EINVAL; + } + + if (strcasecmp(str, "optimized") == 0) { + *ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; + } else if (strcasecmp(str, "non_optimized") == 0) { + *ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE; + } else if 
(strcasecmp(str, "inaccessible") == 0) { + *ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE; + } else { + return -ENOENT; + } + + return 0; +} + +static void +rpc_nvmf_subsystem_listener_set_ana_state(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_listener_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + ctx->request = request; + + if (spdk_json_decode_object(params, nvmf_rpc_set_ana_state_decoder, + SPDK_COUNTOF(nvmf_rpc_set_ana_state_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target.\n"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->tgt = tgt; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Unable to find subsystem with NQN %s", + ctx->nqn); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->subsystem = subsystem; + + if (rpc_listen_address_to_trid(&ctx->address, &ctx->trid)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + if (rpc_ana_state_parse(ctx->ana_state_str, &ctx->ana_state)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + 
return; + } + + ctx->op = NVMF_RPC_LISTEN_SET_ANA_STATE; + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Internal error"); + nvmf_rpc_listener_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_listener_set_ana_state", + rpc_nvmf_subsystem_listener_set_ana_state, SPDK_RPC_RUNTIME); + struct spdk_nvmf_ns_params { char *bdev_name; char *ptpl_file; @@ -876,12 +1063,6 @@ struct spdk_nvmf_ns_params { struct spdk_uuid uuid; }; -struct rpc_namespaces { - size_t num_ns; - struct spdk_nvmf_ns_params ns_params[RPC_MAX_NAMESPACES]; -}; - - static const struct spdk_json_object_decoder rpc_ns_params_decoders[] = { {"nsid", offsetof(struct spdk_nvmf_ns_params, nsid), spdk_json_decode_uint32, true}, {"bdev_name", offsetof(struct spdk_nvmf_ns_params, bdev_name), spdk_json_decode_string}, @@ -926,6 +1107,24 @@ nvmf_rpc_ns_ctx_free(struct nvmf_rpc_ns_ctx *ctx) free(ctx); } +static void +nvmf_rpc_ns_failback_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_ns_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request = ctx->request; + + if (status) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to add ns, subsystem in invalid state"); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to add ns, subsystem in active state"); + } + + nvmf_rpc_ns_ctx_free(ctx); +} + static void nvmf_rpc_ns_resumed(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) @@ -935,6 +1134,27 @@ nvmf_rpc_ns_resumed(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid = ctx->ns_params.nsid; bool response_sent = ctx->response_sent; struct spdk_json_write_ctx *w; + int rc; + + /* The case where the call to add the namespace was successful, but the subsystem couldn't be resumed. 
*/ + if (status && !ctx->response_sent) { + rc = spdk_nvmf_subsystem_remove_ns(subsystem, nsid); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to add ns, subsystem in invalid state"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + rc = spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_ns_failback_resumed, ctx); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + return; + } nvmf_rpc_ns_ctx_free(ctx); @@ -1001,6 +1221,7 @@ rpc_nvmf_subsystem_add_ns(struct spdk_jsonrpc_request *request, struct nvmf_rpc_ns_ctx *ctx; struct spdk_nvmf_subsystem *subsystem; struct spdk_nvmf_tgt *tgt; + int rc; ctx = calloc(1, sizeof(*ctx)); if (!ctx) { @@ -1037,8 +1258,14 @@ rpc_nvmf_subsystem_add_ns(struct spdk_jsonrpc_request *request, return; } - if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_ns_paused, ctx)) { - spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + rc = spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_ns_paused, ctx); + if (rc != 0) { + if (rc == -EBUSY) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "subsystem busy, retry later.\n"); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } nvmf_rpc_ns_ctx_free(ctx); } } @@ -1117,6 +1344,7 @@ rpc_nvmf_subsystem_remove_ns(struct spdk_jsonrpc_request *request, struct nvmf_rpc_remove_ns_ctx *ctx; struct spdk_nvmf_subsystem *subsystem; struct spdk_nvmf_tgt *tgt; + int rc; ctx = calloc(1, sizeof(*ctx)); if (!ctx) { @@ -1153,8 +1381,14 @@ rpc_nvmf_subsystem_remove_ns(struct spdk_jsonrpc_request *request, return; } - if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_remove_ns_paused, ctx)) { - spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + rc = 
spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_remove_ns_paused, ctx); + if (rc != 0) { + if (rc == -EBUSY) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "subsystem busy, retry later.\n"); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } nvmf_rpc_remove_ns_ctx_free(ctx); } } @@ -1255,6 +1489,7 @@ rpc_nvmf_subsystem_add_host(struct spdk_jsonrpc_request *request, struct nvmf_rpc_host_ctx *ctx; struct spdk_nvmf_subsystem *subsystem; struct spdk_nvmf_tgt *tgt; + int rc; ctx = calloc(1, sizeof(*ctx)); if (!ctx) { @@ -1292,8 +1527,14 @@ rpc_nvmf_subsystem_add_host(struct spdk_jsonrpc_request *request, return; } - if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { - spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + rc = spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx); + if (rc != 0) { + if (rc == -EBUSY) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "subsystem busy, retry later.\n"); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } nvmf_rpc_host_ctx_free(ctx); } } @@ -1306,6 +1547,7 @@ rpc_nvmf_subsystem_remove_host(struct spdk_jsonrpc_request *request, struct nvmf_rpc_host_ctx *ctx; struct spdk_nvmf_subsystem *subsystem; struct spdk_nvmf_tgt *tgt; + int rc; ctx = calloc(1, sizeof(*ctx)); if (!ctx) { @@ -1343,8 +1585,14 @@ rpc_nvmf_subsystem_remove_host(struct spdk_jsonrpc_request *request, return; } - if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { - spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + rc = spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx); + if (rc != 0) { + if (rc == -EBUSY) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "subsystem busy, retry later.\n"); + 
} else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } nvmf_rpc_host_ctx_free(ctx); } } @@ -1365,6 +1613,7 @@ rpc_nvmf_subsystem_allow_any_host(struct spdk_jsonrpc_request *request, struct nvmf_rpc_host_ctx *ctx; struct spdk_nvmf_subsystem *subsystem; struct spdk_nvmf_tgt *tgt; + int rc; ctx = calloc(1, sizeof(*ctx)); if (!ctx) { @@ -1402,8 +1651,14 @@ rpc_nvmf_subsystem_allow_any_host(struct spdk_jsonrpc_request *request, return; } - if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { - spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + rc = spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx); + if (rc != 0) { + if (rc == -EBUSY) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "subsystem busy, retry later.\n"); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } nvmf_rpc_host_ctx_free(ctx); } } @@ -1646,6 +1901,14 @@ static const struct spdk_json_object_decoder nvmf_rpc_create_transport_decoder[] "sock_priority", offsetof(struct nvmf_rpc_create_transport_ctx, opts.sock_priority), spdk_json_decode_uint32, true }, + { + "acceptor_backlog", offsetof(struct nvmf_rpc_create_transport_ctx, opts.acceptor_backlog), + spdk_json_decode_int32, true + }, + { + "abort_timeout_sec", offsetof(struct nvmf_rpc_create_transport_ctx, opts.abort_timeout_sec), + spdk_json_decode_uint32, true + }, { "tgt_name", offsetof(struct nvmf_rpc_create_transport_ctx, tgt_name), spdk_json_decode_string, true @@ -1793,10 +2056,12 @@ dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *t if (type == SPDK_NVME_TRANSPORT_RDMA) { spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth); spdk_json_write_named_bool(w, "no_srq", opts->no_srq); + spdk_json_write_named_int32(w, "acceptor_backlog", opts->acceptor_backlog); } else if (type == 
SPDK_NVME_TRANSPORT_TCP) { spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success); spdk_json_write_named_uint32(w, "sock_priority", opts->sock_priority); } + spdk_json_write_named_uint32(w, "abort_timeout_sec", opts->abort_timeout_sec); spdk_json_write_object_end(w); } @@ -2000,3 +2265,322 @@ rpc_nvmf_get_stats(struct spdk_jsonrpc_request *request, } SPDK_RPC_REGISTER("nvmf_get_stats", rpc_nvmf_get_stats, SPDK_RPC_RUNTIME) + +static void +dump_nvmf_ctrlr(struct spdk_json_write_ctx *w, struct spdk_nvmf_ctrlr *ctrlr) +{ + char uuid_str[SPDK_UUID_STRING_LEN] = {}; + uint32_t count; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_uint32(w, "cntlid", ctrlr->cntlid); + + spdk_json_write_named_string(w, "hostnqn", ctrlr->hostnqn); + + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ctrlr->hostid); + spdk_json_write_named_string(w, "hostid", uuid_str); + + count = spdk_bit_array_count_set(ctrlr->qpair_mask); + spdk_json_write_named_uint32(w, "num_io_qpairs", count); + + spdk_json_write_object_end(w); +} + +static const char * +nvmf_qpair_state_str(enum spdk_nvmf_qpair_state state) +{ + switch (state) { + case SPDK_NVMF_QPAIR_UNINITIALIZED: + return "uninitialized"; + case SPDK_NVMF_QPAIR_ACTIVE: + return "active"; + case SPDK_NVMF_QPAIR_DEACTIVATING: + return "deactivating"; + case SPDK_NVMF_QPAIR_ERROR: + return "error"; + default: + return NULL; + } +} + +static void +dump_nvmf_qpair(struct spdk_json_write_ctx *w, struct spdk_nvmf_qpair *qpair) +{ + const struct spdk_nvme_transport_id *trid = qpair->trid; + const char *adrfam; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_uint32(w, "cntlid", qpair->ctrlr->cntlid); + spdk_json_write_named_uint32(w, "qid", qpair->qid); + spdk_json_write_named_string(w, "state", nvmf_qpair_state_str(qpair->state)); + + spdk_json_write_named_object_begin(w, "listen_address"); + adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + if (adrfam == NULL) { + adrfam = "unknown"; + } + 
spdk_json_write_named_string(w, "trtype", trid->trstring); + spdk_json_write_named_string(w, "adrfam", adrfam); + spdk_json_write_named_string(w, "traddr", trid->traddr); + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const char * +nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) +{ + switch (ana_state) { + case SPDK_NVME_ANA_OPTIMIZED_STATE: + return "optimized"; + case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: + return "non_optimized"; + case SPDK_NVME_ANA_INACCESSIBLE_STATE: + return "inaccessible"; + case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: + return "persistent_loss"; + case SPDK_NVME_ANA_CHANGE_STATE: + return "change"; + default: + return NULL; + } +} + +static void +dump_nvmf_subsystem_listener(struct spdk_json_write_ctx *w, + struct spdk_nvmf_subsystem_listener *listener) +{ + const struct spdk_nvme_transport_id *trid = listener->trid; + const char *adrfam; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_object_begin(w, "address"); + adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + if (adrfam == NULL) { + adrfam = "unknown"; + } + spdk_json_write_named_string(w, "trtype", trid->trstring); + spdk_json_write_named_string(w, "adrfam", adrfam); + spdk_json_write_named_string(w, "traddr", trid->traddr); + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + spdk_json_write_object_end(w); + + spdk_json_write_named_string(w, "ana_state", + nvme_ana_state_str(listener->ana_state)); + + spdk_json_write_object_end(w); +} + +struct rpc_subsystem_query_ctx { + char *nqn; + char *tgt_name; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; +}; + +static const struct spdk_json_object_decoder rpc_subsystem_query_decoders[] = { + {"nqn", offsetof(struct rpc_subsystem_query_ctx, nqn), spdk_json_decode_string}, + {"tgt_name", offsetof(struct rpc_subsystem_query_ctx, tgt_name), 
spdk_json_decode_string, true}, +}; + +static void +free_rpc_subsystem_query_ctx(struct rpc_subsystem_query_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->tgt_name); + free(ctx); +} + +static void +rpc_nvmf_get_controllers_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct rpc_subsystem_query_ctx *ctx = cb_arg; + struct spdk_json_write_ctx *w; + struct spdk_nvmf_ctrlr *ctrlr; + + w = spdk_jsonrpc_begin_result(ctx->request); + + spdk_json_write_array_begin(w); + TAILQ_FOREACH(ctrlr, &ctx->subsystem->ctrlrs, link) { + dump_nvmf_ctrlr(w, ctrlr); + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(ctx->request, w); + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, NULL, NULL)) { + SPDK_ERRLOG("Resuming subsystem with NQN %s failed\n", ctx->nqn); + /* FIXME: RPC should fail if resuming the subsystem failed. */ + } + + free_rpc_subsystem_query_ctx(ctx); +} + +static void +rpc_nvmf_get_qpairs_done(struct spdk_io_channel_iter *i, int status) +{ + struct rpc_subsystem_query_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + spdk_json_write_array_end(ctx->w); + spdk_jsonrpc_end_result(ctx->request, ctx->w); + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, NULL, NULL)) { + SPDK_ERRLOG("Resuming subsystem with NQN %s failed\n", ctx->nqn); + /* FIXME: RPC should fail if resuming the subsystem failed. 
*/ + } + + free_rpc_subsystem_query_ctx(ctx); +} + +static void +rpc_nvmf_get_qpairs(struct spdk_io_channel_iter *i) +{ + struct rpc_subsystem_query_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_qpair *qpair; + + ch = spdk_get_io_channel(ctx->subsystem->tgt); + group = spdk_io_channel_get_ctx(ch); + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if (qpair->ctrlr->subsys == ctx->subsystem) { + dump_nvmf_qpair(ctx->w, qpair); + } + } + + spdk_for_each_channel_continue(i, 0); +} + +static void +rpc_nvmf_get_qpairs_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct rpc_subsystem_query_ctx *ctx = cb_arg; + + ctx->w = spdk_jsonrpc_begin_result(ctx->request); + + spdk_json_write_array_begin(ctx->w); + + spdk_for_each_channel(ctx->subsystem->tgt, + rpc_nvmf_get_qpairs, + ctx, + rpc_nvmf_get_qpairs_done); +} + +static void +rpc_nvmf_get_listeners_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct rpc_subsystem_query_ctx *ctx = cb_arg; + struct spdk_json_write_ctx *w; + struct spdk_nvmf_subsystem_listener *listener; + + w = spdk_jsonrpc_begin_result(ctx->request); + + spdk_json_write_array_begin(w); + + for (listener = spdk_nvmf_subsystem_get_first_listener(ctx->subsystem); + listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(ctx->subsystem, listener)) { + dump_nvmf_subsystem_listener(w, listener); + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(ctx->request, w); + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, NULL, NULL)) { + SPDK_ERRLOG("Resuming subsystem with NQN %s failed\n", ctx->nqn); + /* FIXME: RPC should fail if resuming the subsystem failed. 
*/ + } + + free_rpc_subsystem_query_ctx(ctx); +} + +static void +_rpc_nvmf_subsystem_query(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params, + spdk_nvmf_subsystem_state_change_done cb_fn) +{ + struct rpc_subsystem_query_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + ctx->request = request; + + if (spdk_json_decode_object(params, rpc_subsystem_query_decoders, + SPDK_COUNTOF(rpc_subsystem_query_decoders), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_subsystem_query_ctx(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target"); + free_rpc_subsystem_query_ctx(ctx); + return; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_subsystem_query_ctx(ctx); + return; + } + + ctx->subsystem = subsystem; + + if (spdk_nvmf_subsystem_pause(subsystem, cb_fn, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Internal error"); + free_rpc_subsystem_query_ctx(ctx); + return; + } +} + +static void +rpc_nvmf_subsystem_get_controllers(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + _rpc_nvmf_subsystem_query(request, params, rpc_nvmf_get_controllers_paused); +} +SPDK_RPC_REGISTER("nvmf_subsystem_get_controllers", rpc_nvmf_subsystem_get_controllers, + SPDK_RPC_RUNTIME); + +static 
void +rpc_nvmf_subsystem_get_qpairs(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + _rpc_nvmf_subsystem_query(request, params, rpc_nvmf_get_qpairs_paused); +} +SPDK_RPC_REGISTER("nvmf_subsystem_get_qpairs", rpc_nvmf_subsystem_get_qpairs, SPDK_RPC_RUNTIME); + +static void +rpc_nvmf_subsystem_get_listeners(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + _rpc_nvmf_subsystem_query(request, params, rpc_nvmf_get_listeners_paused); +} +SPDK_RPC_REGISTER("nvmf_subsystem_get_listeners", rpc_nvmf_subsystem_get_listeners, + SPDK_RPC_RUNTIME); diff --git a/lib/nvmf/rdma.c b/lib/nvmf/rdma.c index 2141f72fa33..cbc003e1f0e 100644 --- a/lib/nvmf/rdma.c +++ b/lib/nvmf/rdma.c @@ -45,6 +45,8 @@ #include "spdk_internal/log.h" #include "spdk_internal/rdma.h" +#include "nvmf_internal.h" + struct spdk_nvme_rdma_hooks g_nvmf_hooks = {}; const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma; @@ -59,9 +61,6 @@ const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma; #define DEFAULT_NVMF_RDMA_CQ_SIZE 4096 #define MAX_WR_PER_QP(queue_depth) (queue_depth * 3 + 2) -/* Timeout for destroying defunct rqpairs */ -#define NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US 4000000 - static int g_spdk_nvmf_ibv_query_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | @@ -264,12 +263,6 @@ struct spdk_nvmf_rdma_request { STAILQ_ENTRY(spdk_nvmf_rdma_request) state_link; }; -enum spdk_nvmf_rdma_qpair_disconnect_flags { - RDMA_QP_DISCONNECTING = 1, - RDMA_QP_RECV_DRAINED = 1 << 1, - RDMA_QP_SEND_DRAINED = 1 << 2 -}; - struct spdk_nvmf_rdma_resource_opts { struct spdk_nvmf_rdma_qpair *qpair; /* qp points either to an ibv_qp object or an ibv_srq object depending on the value of shared. 
*/ @@ -280,11 +273,6 @@ struct spdk_nvmf_rdma_resource_opts { bool shared; }; -struct spdk_nvmf_send_wr_list { - struct ibv_send_wr *first; - struct ibv_send_wr *last; -}; - struct spdk_nvmf_recv_wr_list { struct ibv_recv_wr *first; struct ibv_recv_wr *last; @@ -393,24 +381,19 @@ struct spdk_nvmf_rdma_qpair { */ enum ibv_qp_state ibv_state; - uint32_t disconnect_flags; - - /* Poller registered in case the qpair doesn't properly - * complete the qpair destruct process and becomes defunct. + /* + * io_channel which is used to destroy qpair when it is removed from poll group */ - - struct spdk_poller *destruct_poller; + struct spdk_io_channel *destruct_channel; /* List of ibv async events */ STAILQ_HEAD(, spdk_nvmf_rdma_ibv_event_ctx) ibv_events; - /* There are several ways a disconnect can start on a qpair - * and they are not all mutually exclusive. It is important - * that we only initialize one of these paths. - */ - bool disconnect_started; /* Lets us know that we have received the last_wqe event. 
*/ bool last_wqe_reached; + + /* Indicate that nvmf_rdma_close_qpair is called */ + bool to_close; }; struct spdk_nvmf_rdma_poller_stat { @@ -510,9 +493,6 @@ struct spdk_nvmf_rdma_transport { TAILQ_HEAD(, spdk_nvmf_rdma_poll_group) poll_groups; }; -static inline void -nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair); - static bool nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, struct spdk_nvmf_rdma_request *rdma_req); @@ -843,8 +823,6 @@ nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair) spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0); - spdk_poller_unregister(&rqpair->destruct_poller); - if (rqpair->qd != 0) { struct spdk_nvmf_qpair *qpair = &rqpair->qpair; struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(qpair->transport, @@ -908,6 +886,11 @@ nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair) nvmf_rdma_qpair_clean_ibv_events(rqpair); + if (rqpair->destruct_channel) { + spdk_put_io_channel(rqpair->destruct_channel); + rqpair->destruct_channel = NULL; + } + free(rqpair); } @@ -1279,6 +1262,7 @@ nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *e rqpair->cm_id = event->id; rqpair->listen_id = event->listen_id; rqpair->qpair.transport = transport; + rqpair->qpair.trid = port->trid; STAILQ_INIT(&rqpair->ibv_events); /* use qid from the private data to determine the qpair type qid will be set to the appropriate value when the controller is created */ @@ -1989,6 +1973,14 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, /* The next state transition depends on the data transfer needs of this request. 
*/ rdma_req->req.xfer = spdk_nvmf_req_get_xfer(&rdma_req->req); + if (spdk_unlikely(rdma_req->req.xfer == SPDK_NVME_DATA_BIDIRECTIONAL)) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE; + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p: invalid xfer type (BIDIRECTIONAL)\n", rdma_req); + break; + } + /* If no data to transfer, ready to execute. */ if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; @@ -2084,7 +2076,7 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, if (rc != 0) { SPDK_ERRLOG("DIF generation failed\n"); rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; - nvmf_rdma_start_disconnect(rqpair); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); break; } } @@ -2219,6 +2211,8 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32 #define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false #define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false +#define SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG 100 +#define SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC 1 static void nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) @@ -2234,6 +2228,8 @@ nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH; opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ; opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP; + opts->acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG; + opts->abort_timeout_sec = SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC; } const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = { @@ -2294,7 +2290,8 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" " in_capsule_data_size=%d, max_aq_depth=%d,\n" - " num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d\n", + " 
num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d," + " acceptor_backlog=%d, abort_timeout_sec=%d\n", opts->max_queue_depth, opts->max_io_size, opts->max_qpairs_per_ctrlr - 1, @@ -2303,13 +2300,21 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) opts->max_aq_depth, opts->num_shared_buffers, opts->max_srq_depth, - opts->no_srq); + opts->no_srq, + opts->acceptor_backlog, + opts->abort_timeout_sec); /* I/O unit size cannot be larger than max I/O size */ if (opts->io_unit_size > opts->max_io_size) { opts->io_unit_size = opts->max_io_size; } + if (opts->acceptor_backlog <= 0) { + SPDK_ERRLOG("The acceptor backlog cannot be less than 1, setting to the default value of (%d).\n", + SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG); + opts->acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG; + } + if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) { SPDK_ERRLOG("The number of shared data buffers (%d) is less than" "the minimum number required to guarantee that forward progress can be made (%d)\n", @@ -2318,7 +2323,7 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) return NULL; } - min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size; + min_shared_buffers = spdk_env_get_core_count() * opts->buf_cache_size; if (min_shared_buffers > opts->num_shared_buffers) { SPDK_ERRLOG("There are not enough buffers to satisfy" "per-poll group caches for each thread. 
(%" PRIu32 ")" @@ -2625,7 +2630,7 @@ nvmf_rdma_listen(struct spdk_nvmf_transport *transport, return -1; } - rc = rdma_listen(port->id, 10); /* 10 = backlog */ + rc = rdma_listen(port->id, transport->opts.acceptor_backlog); if (rc < 0) { SPDK_ERRLOG("rdma_listen() failed\n"); rdma_destroy_id(port->id); @@ -2735,50 +2740,18 @@ nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, } static void -_nvmf_rdma_qpair_disconnect(void *ctx) +nvmf_rdma_destroy_drained_qpair(struct spdk_nvmf_rdma_qpair *rqpair) { - struct spdk_nvmf_qpair *qpair = ctx; - - spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); -} + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, + struct spdk_nvmf_rdma_transport, transport); -static void -_nvmf_rdma_try_disconnect(void *ctx) -{ - struct spdk_nvmf_qpair *qpair = ctx; - struct spdk_nvmf_poll_group *group; + nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); - /* Read the group out of the qpair. This is normally set and accessed only from - * the thread that created the group. Here, we're not on that thread necessarily. - * The data member qpair->group begins it's life as NULL and then is assigned to - * a pointer and never changes. So fortunately reading this and checking for - * non-NULL is thread safe in the x86_64 memory model. */ - group = qpair->group; - - if (group == NULL) { - /* The qpair hasn't been assigned to a group yet, so we can't - * process a disconnect. Send a message to ourself and try again. 
*/ - spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_try_disconnect, qpair); + /* nvmr_rdma_close_qpair is not called */ + if (!rqpair->to_close) { return; } - spdk_thread_send_msg(group->thread, _nvmf_rdma_qpair_disconnect, qpair); -} - -static inline void -nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair) -{ - if (!__atomic_test_and_set(&rqpair->disconnect_started, __ATOMIC_RELAXED)) { - _nvmf_rdma_try_disconnect(&rqpair->qpair); - } -} - -static void nvmf_rdma_destroy_drained_qpair(void *ctx) -{ - struct spdk_nvmf_rdma_qpair *rqpair = ctx; - struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, - struct spdk_nvmf_rdma_transport, transport); - /* In non SRQ path, we will reach rqpair->max_queue_depth. In SRQ path, we will get the last_wqe event. */ if (rqpair->current_send_depth != 0) { return; @@ -2788,21 +2761,19 @@ static void nvmf_rdma_destroy_drained_qpair(void *ctx) return; } - if (rqpair->srq != NULL && rqpair->last_wqe_reached == false) { + /* Judge whether the device is emulated by Software RoCE. 
+ * And it will not send last_wqe event + */ + if (rqpair->srq != NULL && rqpair->device->attr.vendor_id != 0 && + rqpair->last_wqe_reached == false) { return; } - nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); - - /* Qpair will be destroyed after nvmf layer closes this qpair */ - if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ERROR) { - return; - } + assert(rqpair->qpair.state == SPDK_NVMF_QPAIR_ERROR); nvmf_rdma_qpair_destroy(rqpair); } - static int nvmf_rdma_disconnect(struct rdma_cm_event *evt) { @@ -2824,7 +2795,7 @@ nvmf_rdma_disconnect(struct rdma_cm_event *evt) spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); - nvmf_rdma_start_disconnect(rqpair); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); return 0; } @@ -2862,7 +2833,7 @@ nvmf_rdma_disconnect_qpairs_on_port(struct spdk_nvmf_rdma_transport *rtransport, TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) { if (rqpair->listen_id == port->id) { - nvmf_rdma_start_disconnect(rqpair); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); } } } @@ -3022,13 +2993,6 @@ nvmf_process_cm_event(struct spdk_nvmf_transport *transport) } } -static void -nvmf_rdma_handle_qp_fatal(struct spdk_nvmf_rdma_qpair *rqpair) -{ - nvmf_rdma_update_ibv_state(rqpair); - nvmf_rdma_start_disconnect(rqpair); -} - static void nvmf_rdma_handle_last_wqe_reached(struct spdk_nvmf_rdma_qpair *rqpair) { @@ -3036,12 +3000,6 @@ nvmf_rdma_handle_last_wqe_reached(struct spdk_nvmf_rdma_qpair *rqpair) nvmf_rdma_destroy_drained_qpair(rqpair); } -static void -nvmf_rdma_handle_sq_drained(struct spdk_nvmf_rdma_qpair *rqpair) -{ - nvmf_rdma_start_disconnect(rqpair); -} - static void nvmf_rdma_qpair_process_ibv_event(void *ctx) { @@ -3061,25 +3019,39 @@ nvmf_rdma_send_qpair_async_event(struct spdk_nvmf_rdma_qpair *rqpair, spdk_nvmf_rdma_qpair_ibv_event fn) { struct spdk_nvmf_rdma_ibv_event_ctx *ctx; + struct spdk_thread *thr = NULL; + int rc; + + if 
(rqpair->qpair.group) { + thr = rqpair->qpair.group->thread; + } else if (rqpair->destruct_channel) { + thr = spdk_io_channel_get_thread(rqpair->destruct_channel); + } - if (!rqpair->qpair.group) { - return EINVAL; + if (!thr) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "rqpair %p has no thread\n", rqpair); + return -EINVAL; } ctx = calloc(1, sizeof(*ctx)); if (!ctx) { - return ENOMEM; + return -ENOMEM; } ctx->rqpair = rqpair; ctx->cb_fn = fn; STAILQ_INSERT_TAIL(&rqpair->ibv_events, ctx, link); - return spdk_thread_send_msg(rqpair->qpair.group->thread, nvmf_rdma_qpair_process_ibv_event, - ctx); + rc = spdk_thread_send_msg(thr, nvmf_rdma_qpair_process_ibv_event, ctx); + if (rc) { + STAILQ_REMOVE(&rqpair->ibv_events, ctx, spdk_nvmf_rdma_ibv_event_ctx, link); + free(ctx); + } + + return rc; } -static void +static int nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) { int rc; @@ -3089,9 +3061,8 @@ nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) rc = ibv_get_async_event(device->context, &event); if (rc) { - SPDK_ERRLOG("Failed to get async_event (%d): %s\n", - errno, spdk_strerror(errno)); - return; + /* In non-blocking mode -1 means there are no events available */ + return rc; } switch (event.event_type) { @@ -3100,17 +3071,16 @@ nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) SPDK_ERRLOG("Fatal event received for rqpair %p\n", rqpair); spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, (uintptr_t)rqpair->cm_id, event.event_type); - if (nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_qp_fatal)) { - SPDK_ERRLOG("Failed to send QP_FATAL event for rqpair %p\n", rqpair); - nvmf_rdma_handle_qp_fatal(rqpair); - } + nvmf_rdma_update_ibv_state(rqpair); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); break; case IBV_EVENT_QP_LAST_WQE_REACHED: /* This event only occurs for shared receive queues. 
*/ rqpair = event.element.qp->qp_context; SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last WQE reached event received for rqpair %p\n", rqpair); - if (nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_last_wqe_reached)) { - SPDK_ERRLOG("Failed to send LAST_WQE_REACHED event for rqpair %p\n", rqpair); + rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_last_wqe_reached); + if (rc) { + SPDK_WARNLOG("Failed to send LAST_WQE_REACHED event. rqpair %p, err %d\n", rqpair, rc); rqpair->last_wqe_reached = true; } break; @@ -3122,10 +3092,7 @@ nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, (uintptr_t)rqpair->cm_id, event.event_type); if (nvmf_rdma_update_ibv_state(rqpair) == IBV_QPS_ERR) { - if (nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_sq_drained)) { - SPDK_ERRLOG("Failed to send SQ_DRAINED event for rqpair %p\n", rqpair); - nvmf_rdma_handle_sq_drained(rqpair); - } + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); } break; case IBV_EVENT_QP_REQ_ERR: @@ -3158,6 +3125,24 @@ nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) break; } ibv_ack_async_event(&event); + + return 0; +} + +static void +nvmf_process_ib_events(struct spdk_nvmf_rdma_device *device, uint32_t max_events) +{ + int rc = 0; + uint32_t i = 0; + + for (i = 0; i < max_events; i++) { + rc = nvmf_process_ib_event(device); + if (rc) { + break; + } + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Device %s: %u events processed\n", device->context->device->name, i); } static uint32_t @@ -3188,7 +3173,7 @@ nvmf_rdma_accept(struct spdk_nvmf_transport *transport) /* Second and subsequent poll descriptors are IB async events */ TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { if (rtransport->poll_fds[i++].revents & POLLIN) { - nvmf_process_ib_event(device); + nvmf_process_ib_events(device, 32); nfds--; } } @@ -3444,7 +3429,6 @@ nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair) if (rqpair->cm_id 
!= NULL) { nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); } - nvmf_rdma_qpair_destroy(rqpair); } static int @@ -3495,12 +3479,53 @@ nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, return 0; } +static int +nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + assert(group->transport->tgt != NULL); + + rqpair->destruct_channel = spdk_get_io_channel(group->transport->tgt); + + if (!rqpair->destruct_channel) { + SPDK_WARNLOG("failed to get io_channel, qpair %p\n", qpair); + return 0; + } + + /* Sanity check that we get io_channel on the correct thread */ + if (qpair->group) { + assert(qpair->group->thread == spdk_io_channel_get_thread(rqpair->destruct_channel)); + } + + return 0; +} + static int nvmf_rdma_request_free(struct spdk_nvmf_request *req) { struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, struct spdk_nvmf_rdma_transport, transport); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, + struct spdk_nvmf_rdma_qpair, qpair); + + /* + * AER requests are freed when a qpair is destroyed. The recv corresponding to that request + * needs to be returned to the shared receive queue or the poll group will eventually be + * starved of RECV structures. 
+ */ + if (rqpair->srq && rdma_req->recv) { + int rc; + struct ibv_recv_wr *bad_recv_wr; + + rc = ibv_post_srq_recv(rqpair->srq, &rdma_req->recv->wr, &bad_recv_wr); + if (rc) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + } + } _nvmf_rdma_request_free(rdma_req, rtransport); return 0; @@ -3529,32 +3554,12 @@ nvmf_rdma_request_complete(struct spdk_nvmf_request *req) return 0; } -static int -nvmf_rdma_destroy_defunct_qpair(void *ctx) -{ - struct spdk_nvmf_rdma_qpair *rqpair = ctx; - struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, - struct spdk_nvmf_rdma_transport, transport); - - SPDK_INFOLOG(SPDK_LOG_RDMA, "QP#%d hasn't been drained as expected, manually destroy it\n", - rqpair->qpair.qid); - - nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); - nvmf_rdma_qpair_destroy(rqpair); - - return 0; -} - static void nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) { struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); - if (rqpair->disconnect_flags & RDMA_QP_DISCONNECTING) { - return; - } - - rqpair->disconnect_flags |= RDMA_QP_DISCONNECTING; + rqpair->to_close = true; /* This happens only when the qpair is disconnected before * it is added to the poll group. 
Since there is no poll group, @@ -3563,15 +3568,15 @@ nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) */ if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) { nvmf_rdma_qpair_reject_connection(rqpair); + nvmf_rdma_qpair_destroy(rqpair); return; } - if (rqpair->cm_id) { + if (rqpair->rdma_qp) { spdk_rdma_qp_disconnect(rqpair->rdma_qp); } - rqpair->destruct_poller = SPDK_POLLER_REGISTER(nvmf_rdma_destroy_defunct_qpair, (void *)rqpair, - NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US); + nvmf_rdma_destroy_drained_qpair(rqpair); } static struct spdk_nvmf_rdma_qpair * @@ -3612,7 +3617,7 @@ _poller_reset_failed_recvs(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_rec rdma_recv->qpair->current_recv_depth++; bad_recv_wr = bad_recv_wr->next; SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rdma_recv->qpair, -rc); - nvmf_rdma_start_disconnect(rdma_recv->qpair); + spdk_nvmf_qpair_disconnect(&rdma_recv->qpair->qpair, NULL, NULL); } } @@ -3624,7 +3629,7 @@ _qp_reset_failed_recvs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr * bad_recv_wr = bad_recv_wr->next; rqpair->current_recv_depth++; } - nvmf_rdma_start_disconnect(rqpair); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); } static void @@ -3715,7 +3720,7 @@ _qp_reset_failed_sends(struct spdk_nvmf_rdma_transport *rtransport, if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { /* Disconnect the connection. 
*/ - nvmf_rdma_start_disconnect(rqpair); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); } } @@ -3740,6 +3745,41 @@ _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport, } } +static const char * +nvmf_rdma_wr_type_str(enum spdk_nvmf_rdma_wr_type wr_type) +{ + switch (wr_type) { + case RDMA_WR_TYPE_RECV: + return "RECV"; + case RDMA_WR_TYPE_SEND: + return "SEND"; + case RDMA_WR_TYPE_DATA: + return "DATA"; + default: + SPDK_ERRLOG("Unknown WR type %d\n", wr_type); + SPDK_UNREACHABLE(); + } +} + +static inline void +nvmf_rdma_log_wc_status(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_wc *wc) +{ + enum spdk_nvmf_rdma_wr_type wr_type = ((struct spdk_nvmf_rdma_wr *)wc->wr_id)->type; + + if (wc->status == IBV_WC_WR_FLUSH_ERR) { + /* If qpair is in ERR state, we will receive completions for all posted and not completed + * Work Requests with IBV_WC_WR_FLUSH_ERR status. Don't log an error in that case */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, + "Error on CQ %p, (qp state %d ibv_state %d) request 0x%lu, type %s, status: (%d): %s\n", + rqpair->poller->cq, rqpair->qpair.state, rqpair->ibv_state, wc->wr_id, + nvmf_rdma_wr_type_str(wr_type), wc->status, ibv_wc_status_str(wc->status)); + } else { + SPDK_ERRLOG("Error on CQ %p, (qp state %d ibv_state %d) request 0x%lu, type %s, status: (%d): %s\n", + rqpair->poller->cq, rqpair->qpair.state, rqpair->ibv_state, wc->wr_id, + nvmf_rdma_wr_type_str(wr_type), wc->status, ibv_wc_status_str(wc->status)); + } +} + static int nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, struct spdk_nvmf_rdma_poller *rpoller) @@ -3816,7 +3856,7 @@ nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, if (!wc[i].status) { assert(wc[i].opcode == IBV_WC_RECV); if (rqpair->current_recv_depth >= rqpair->max_queue_depth) { - nvmf_rdma_start_disconnect(rqpair); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); break; } } @@ -3863,22 +3903,14 @@ nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport 
*rtransport, /* Handle error conditions */ if (wc[i].status) { - if ((rdma_wr->type == RDMA_WR_TYPE_RECV && !rpoller->srq)) { - /* When we don't use SRQ and close a qpair, we will receive completions with error - * status for all posted ibv_recv_wrs. This is expected and we don't want to log - * an error in that case. */ - SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Error on CQ %p, request 0x%lu, type %d, status: (%d): %s\n", - rpoller->cq, wc[i].wr_id, rdma_wr->type, wc[i].status, ibv_wc_status_str(wc[i].status)); - } else { - SPDK_ERRLOG("Error on CQ %p, request 0x%lu, type %d, status: (%d): %s\n", - rpoller->cq, wc[i].wr_id, rdma_wr->type, wc[i].status, ibv_wc_status_str(wc[i].status)); - } + nvmf_rdma_update_ibv_state(rqpair); + nvmf_rdma_log_wc_status(rqpair, &wc[i]); error = true; if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { /* Disconnect the connection. */ - nvmf_rdma_start_disconnect(rqpair); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); } else { nvmf_rdma_destroy_drained_qpair(rqpair); } @@ -4016,6 +4048,113 @@ spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) g_nvmf_hooks = *hooks; } +static void +nvmf_rdma_request_set_abort_status(struct spdk_nvmf_request *req, + struct spdk_nvmf_rdma_request *rdma_req_to_abort) +{ + rdma_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + rdma_req_to_abort->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + + rdma_req_to_abort->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + + req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command was successfully aborted. 
*/ +} + +static int +_nvmf_rdma_qpair_abort_request(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_rdma_request *rdma_req_to_abort = SPDK_CONTAINEROF( + req->req_to_abort, struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair, + struct spdk_nvmf_rdma_qpair, qpair); + int rc; + + spdk_poller_unregister(&req->poller); + + switch (rdma_req_to_abort->state) { + case RDMA_REQUEST_STATE_EXECUTING: + rc = nvmf_ctrlr_abort_request(req); + if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) { + return SPDK_POLLER_BUSY; + } + break; + + case RDMA_REQUEST_STATE_NEED_BUFFER: + STAILQ_REMOVE(&rqpair->poller->group->group.pending_buf_queue, + &rdma_req_to_abort->req, spdk_nvmf_request, buf_link); + + nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort); + break; + + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: + STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req_to_abort, + spdk_nvmf_rdma_request, state_link); + + nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort); + break; + + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: + STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req_to_abort, + spdk_nvmf_rdma_request, state_link); + + nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort); + break; + + case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + if (spdk_get_ticks() < req->timeout_tsc) { + req->poller = SPDK_POLLER_REGISTER(_nvmf_rdma_qpair_abort_request, req, 0); + return SPDK_POLLER_BUSY; + } + break; + + default: + break; + } + + spdk_nvmf_request_complete(req); + return SPDK_POLLER_BUSY; +} + +static void +nvmf_rdma_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_transport *transport; + uint16_t cid; + uint32_t i; + struct spdk_nvmf_rdma_request *rdma_req_to_abort = NULL; + + 
rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + transport = &rtransport->transport; + + cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; + + for (i = 0; i < rqpair->max_queue_depth; i++) { + rdma_req_to_abort = &rqpair->resources->reqs[i]; + + if (rdma_req_to_abort->state != RDMA_REQUEST_STATE_FREE && + rdma_req_to_abort->req.cmd->nvme_cmd.cid == cid) { + break; + } + } + + if (rdma_req_to_abort == NULL) { + spdk_nvmf_request_complete(req); + return; + } + + req->req_to_abort = &rdma_req_to_abort->req; + req->timeout_tsc = spdk_get_ticks() + + transport->opts.abort_timeout_sec * spdk_get_ticks_hz(); + req->poller = NULL; + + _nvmf_rdma_qpair_abort_request(req); +} + static int nvmf_rdma_poll_group_get_stat(struct spdk_nvmf_tgt *tgt, struct spdk_nvmf_transport_poll_group_stat **stat) @@ -4103,6 +4242,7 @@ const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { .get_optimal_poll_group = nvmf_rdma_get_optimal_poll_group, .poll_group_destroy = nvmf_rdma_poll_group_destroy, .poll_group_add = nvmf_rdma_poll_group_add, + .poll_group_remove = nvmf_rdma_poll_group_remove, .poll_group_poll = nvmf_rdma_poll_group_poll, .req_free = nvmf_rdma_request_free, @@ -4112,6 +4252,7 @@ const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { .qpair_get_peer_trid = nvmf_rdma_qpair_get_peer_trid, .qpair_get_local_trid = nvmf_rdma_qpair_get_local_trid, .qpair_get_listen_trid = nvmf_rdma_qpair_get_listen_trid, + .qpair_abort_request = nvmf_rdma_qpair_abort_request, .poll_group_get_stat = nvmf_rdma_poll_group_get_stat, .poll_group_free_stat = nvmf_rdma_poll_group_free_stat, diff --git a/lib/nvmf/spdk_nvmf.map b/lib/nvmf/spdk_nvmf.map index f4a14e68421..b6fc9ca3965 100644 --- a/lib/nvmf/spdk_nvmf.map +++ b/lib/nvmf/spdk_nvmf.map @@ -74,9 +74,11 @@ spdk_nvmf_tgt_add_transport; spdk_nvmf_transport_listen; spdk_nvmf_transport_stop_listen; + 
spdk_nvmf_transport_stop_listen_async; spdk_nvmf_transport_poll_group_get_stat; spdk_nvmf_transport_poll_group_free_stat; spdk_nvmf_rdma_init_hooks; + spdk_nvmf_subsystem_set_ana_reporting; # public functions in nvmf_cmd.h spdk_nvmf_ctrlr_identify_ctrlr; @@ -90,6 +92,8 @@ spdk_nvmf_request_get_data; spdk_nvmf_request_get_cmd; spdk_nvmf_request_get_response; + spdk_nvmf_request_get_req_to_abort; + spdk_nvmf_bdev_ctrlr_abort_cmd; # public functions in nvmf_transport.h spdk_nvmf_transport_register; @@ -102,10 +106,10 @@ spdk_nvmf_request_get_buffers_multi; spdk_nvmf_request_get_dif_ctx; spdk_nvmf_request_exec; - spdk_nvmf_request_exec_fabrics; spdk_nvmf_request_free; spdk_nvmf_request_complete; spdk_nvmf_ctrlr_get_subsystem; + spdk_nvmf_ctrlr_get_id; spdk_nvmf_req_get_xfer; spdk_nvmf_poll_group_remove; diff --git a/lib/nvmf/subsystem.c b/lib/nvmf/subsystem.c index ebe8d9a8eda..e76d4f27440 100644 --- a/lib/nvmf/subsystem.c +++ b/lib/nvmf/subsystem.c @@ -232,6 +232,8 @@ nvmf_valid_nqn(const char *nqn) return true; } +static void subsystem_state_change_on_pg(struct spdk_io_channel_iter *i); + struct spdk_nvmf_subsystem * spdk_nvmf_subsystem_create(struct spdk_nvmf_tgt *tgt, const char *nqn, @@ -370,6 +372,31 @@ spdk_nvmf_subsystem_destroy(struct spdk_nvmf_subsystem *subsystem) free(subsystem); } + +/* we have to use the typedef in the function declaration to appease astyle. 
*/ +typedef enum spdk_nvmf_subsystem_state spdk_nvmf_subsystem_state_t; + +static spdk_nvmf_subsystem_state_t +nvmf_subsystem_get_intermediate_state(enum spdk_nvmf_subsystem_state current_state, + enum spdk_nvmf_subsystem_state requested_state) +{ + switch (requested_state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + return SPDK_NVMF_SUBSYSTEM_DEACTIVATING; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + if (current_state == SPDK_NVMF_SUBSYSTEM_PAUSED) { + return SPDK_NVMF_SUBSYSTEM_RESUMING; + } else { + return SPDK_NVMF_SUBSYSTEM_ACTIVATING; + } + case SPDK_NVMF_SUBSYSTEM_PAUSED: + return SPDK_NVMF_SUBSYSTEM_PAUSING; + default: + assert(false); + return SPDK_NVMF_SUBSYSTEM_NUM_STATES; + } +} + static int nvmf_subsystem_set_state(struct spdk_nvmf_subsystem *subsystem, enum spdk_nvmf_subsystem_state state) @@ -417,6 +444,11 @@ nvmf_subsystem_set_state(struct spdk_nvmf_subsystem *subsystem, state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING) { expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; } + /* This is for the case when resuming the subsystem fails. */ + if (actual_old_state == SPDK_NVMF_SUBSYSTEM_RESUMING && + state == SPDK_NVMF_SUBSYSTEM_PAUSING) { + expected_old_state = SPDK_NVMF_SUBSYSTEM_RESUMING; + } actual_old_state = expected_old_state; __atomic_compare_exchange_n(&subsystem->state, &actual_old_state, state, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); @@ -428,16 +460,37 @@ nvmf_subsystem_set_state(struct spdk_nvmf_subsystem *subsystem, struct subsystem_state_change_ctx { struct spdk_nvmf_subsystem *subsystem; + enum spdk_nvmf_subsystem_state original_state; + enum spdk_nvmf_subsystem_state requested_state; spdk_nvmf_subsystem_state_change_done cb_fn; void *cb_arg; }; +static void +subsystem_state_change_revert_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_state_change_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + /* Nothing to be done here if the state setting fails, we are just screwed. 
*/ + if (nvmf_subsystem_set_state(ctx->subsystem, ctx->requested_state)) { + SPDK_ERRLOG("Unable to revert the subsystem state after operation failure.\n"); + } + + ctx->subsystem->changing_state = false; + if (ctx->cb_fn) { + /* return a failure here. This function only exists in an error path. */ + ctx->cb_fn(ctx->subsystem, ctx->cb_arg, -1); + } + free(ctx); +} + static void subsystem_state_change_done(struct spdk_io_channel_iter *i, int status) { struct subsystem_state_change_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + enum spdk_nvmf_subsystem_state intermediate_state; if (status == 0) { status = nvmf_subsystem_set_state(ctx->subsystem, ctx->requested_state); @@ -446,6 +499,24 @@ subsystem_state_change_done(struct spdk_io_channel_iter *i, int status) } } + if (status) { + intermediate_state = nvmf_subsystem_get_intermediate_state(ctx->requested_state, + ctx->original_state); + assert(intermediate_state != SPDK_NVMF_SUBSYSTEM_NUM_STATES); + + if (nvmf_subsystem_set_state(ctx->subsystem, intermediate_state)) { + goto out; + } + ctx->requested_state = ctx->original_state; + spdk_for_each_channel(ctx->subsystem->tgt, + subsystem_state_change_on_pg, + ctx, + subsystem_state_change_revert_done); + return; + } + +out: + ctx->subsystem->changing_state = false; if (ctx->cb_fn) { ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status); } @@ -500,33 +571,33 @@ nvmf_subsystem_state_change(struct spdk_nvmf_subsystem *subsystem, enum spdk_nvmf_subsystem_state intermediate_state; int rc; - switch (requested_state) { - case SPDK_NVMF_SUBSYSTEM_INACTIVE: - intermediate_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING; - break; - case SPDK_NVMF_SUBSYSTEM_ACTIVE: - if (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED) { - intermediate_state = SPDK_NVMF_SUBSYSTEM_RESUMING; - } else { - intermediate_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + if (__sync_val_compare_and_swap(&subsystem->changing_state, false, true)) { + return -EBUSY; + } + + /* If we are already in the requested state, just call the 
callback immediately. */ + if (subsystem->state == requested_state) { + subsystem->changing_state = false; + if (cb_fn) { + cb_fn(subsystem, cb_arg, 0); } - break; - case SPDK_NVMF_SUBSYSTEM_PAUSED: - intermediate_state = SPDK_NVMF_SUBSYSTEM_PAUSING; - break; - default: - assert(false); - return -EINVAL; + return 0; } + intermediate_state = nvmf_subsystem_get_intermediate_state(subsystem->state, requested_state); + assert(intermediate_state != SPDK_NVMF_SUBSYSTEM_NUM_STATES); + ctx = calloc(1, sizeof(*ctx)); if (!ctx) { + subsystem->changing_state = false; return -ENOMEM; } + ctx->original_state = subsystem->state; rc = nvmf_subsystem_set_state(subsystem, intermediate_state); if (rc) { free(ctx); + subsystem->changing_state = false; return rc; } @@ -777,6 +848,7 @@ spdk_nvmf_subsystem_add_listener(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_transport *transport; struct spdk_nvmf_subsystem_listener *listener; struct spdk_nvmf_listener *tr_listener; + int rc = 0; assert(cb_fn != NULL); @@ -817,14 +889,13 @@ spdk_nvmf_subsystem_add_listener(struct spdk_nvmf_subsystem *subsystem, listener->cb_fn = cb_fn; listener->cb_arg = cb_arg; listener->subsystem = subsystem; + listener->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; if (transport->ops->listen_associate != NULL) { - transport->ops->listen_associate(transport, subsystem, trid, - _nvmf_subsystem_add_listener_done, - listener); - } else { - _nvmf_subsystem_add_listener_done(listener, 0); + rc = transport->ops->listen_associate(transport, subsystem, trid); } + + _nvmf_subsystem_add_listener_done(listener, rc); } int @@ -1006,51 +1077,118 @@ spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t ns return 0; } +struct subsystem_ns_change_ctx { + struct spdk_nvmf_subsystem *subsystem; + spdk_nvmf_subsystem_state_change_done cb_fn; + uint32_t nsid; +}; + static void _nvmf_ns_hot_remove(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) { - struct spdk_nvmf_ns *ns = cb_arg; + 
struct subsystem_ns_change_ctx *ctx = cb_arg; int rc; - rc = spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid); + rc = spdk_nvmf_subsystem_remove_ns(subsystem, ctx->nsid); if (rc != 0) { SPDK_ERRLOG("Failed to make changes to NVME-oF subsystem with id: %u\n", subsystem->id); } spdk_nvmf_subsystem_resume(subsystem, NULL, NULL); + + free(ctx); +} + +static void +nvmf_ns_change_msg(void *ns_ctx) +{ + struct subsystem_ns_change_ctx *ctx = ns_ctx; + int rc; + + rc = spdk_nvmf_subsystem_pause(ctx->subsystem, ctx->cb_fn, ctx); + if (rc) { + if (rc == -EBUSY) { + /* Try again, this is not a permanent situation. */ + spdk_thread_send_msg(spdk_get_thread(), nvmf_ns_change_msg, ctx); + } else { + free(ctx); + SPDK_ERRLOG("Unable to pause subsystem to process namespace removal!\n"); + } + } } static void nvmf_ns_hot_remove(void *remove_ctx) { struct spdk_nvmf_ns *ns = remove_ctx; + struct subsystem_ns_change_ctx *ns_ctx; int rc; - rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_hot_remove, ns); + /* We have to allocate a new context because this op + * is asynchronous and we could lose the ns in the middle. + */ + ns_ctx = calloc(1, sizeof(struct subsystem_ns_change_ctx)); + if (!ns_ctx) { + SPDK_ERRLOG("Unable to allocate context to process namespace removal!\n"); + return; + } + + ns_ctx->subsystem = ns->subsystem; + ns_ctx->nsid = ns->opts.nsid; + ns_ctx->cb_fn = _nvmf_ns_hot_remove; + + rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_hot_remove, ns_ctx); if (rc) { - SPDK_ERRLOG("Unable to pause subsystem to process namespace removal!\n"); + if (rc == -EBUSY) { + /* Try again, this is not a permanent situation. 
*/ + spdk_thread_send_msg(spdk_get_thread(), nvmf_ns_change_msg, ns_ctx); + } else { + SPDK_ERRLOG("Unable to pause subsystem to process namespace removal!\n"); + free(ns_ctx); + } } } static void _nvmf_ns_resize(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) { - struct spdk_nvmf_ns *ns = cb_arg; + struct subsystem_ns_change_ctx *ctx = cb_arg; - nvmf_subsystem_ns_changed(subsystem, ns->opts.nsid); + nvmf_subsystem_ns_changed(subsystem, ctx->nsid); spdk_nvmf_subsystem_resume(subsystem, NULL, NULL); + + free(ctx); } static void nvmf_ns_resize(void *event_ctx) { struct spdk_nvmf_ns *ns = event_ctx; + struct subsystem_ns_change_ctx *ns_ctx; int rc; - rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_resize, ns); + /* We have to allocate a new context because this op + * is asynchronous and we could lose the ns in the middle. + */ + ns_ctx = calloc(1, sizeof(struct subsystem_ns_change_ctx)); + if (!ns_ctx) { + SPDK_ERRLOG("Unable to allocate context to process namespace removal!\n"); + return; + } + + ns_ctx->subsystem = ns->subsystem; + ns_ctx->nsid = ns->opts.nsid; + ns_ctx->cb_fn = _nvmf_ns_resize; + + rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_resize, ns_ctx); if (rc) { + if (rc == -EBUSY) { + /* Try again, this is not a permanent situation. 
*/ + spdk_thread_send_msg(spdk_get_thread(), nvmf_ns_change_msg, ns_ctx); + } SPDK_ERRLOG("Unable to pause subsystem to process namespace resize!\n"); + free(ns_ctx); } } @@ -2513,3 +2651,116 @@ nvmf_ns_reservation_request(void *ctx) update_done: _nvmf_ns_reservation_update_done(ctrlr->subsys, (void *)req, 0); } + +int +spdk_nvmf_subsystem_set_ana_reporting(struct spdk_nvmf_subsystem *subsystem, + bool ana_reporting) +{ + if (subsystem->state != SPDK_NVMF_SUBSYSTEM_INACTIVE) { + return -EAGAIN; + } + + subsystem->ana_reporting = ana_reporting; + + return 0; +} + +struct subsystem_listener_update_ctx { + struct spdk_nvmf_subsystem_listener *listener; + + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn; + void *cb_arg; +}; + +static void +subsystem_listener_update_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_listener_update_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, status); + } + free(ctx); +} + +static void +subsystem_listener_update_on_pg(struct spdk_io_channel_iter *i) +{ + struct subsystem_listener_update_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_nvmf_subsystem_listener *listener; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_ctrlr *ctrlr; + + listener = ctx->listener; + group = spdk_io_channel_get_ctx(spdk_io_channel_iter_get_channel(i)); + + TAILQ_FOREACH(ctrlr, &listener->subsystem->ctrlrs, link) { + if (ctrlr->admin_qpair->group == group && ctrlr->listener == listener) { + nvmf_ctrlr_async_event_ana_change_notice(ctrlr); + } + } + + spdk_for_each_channel_continue(i, 0); +} + +void +nvmf_subsystem_set_ana_state(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid, + enum spdk_nvme_ana_state ana_state, + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn, void *cb_arg) +{ + struct spdk_nvmf_subsystem_listener *listener; + struct subsystem_listener_update_ctx *ctx; + + assert(cb_fn != NULL); + assert(subsystem->state == 
SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED); + + if (!subsystem->ana_reporting) { + SPDK_ERRLOG("ANA reporting is disabled\n"); + cb_fn(cb_arg, -EINVAL); + return; + } + + /* ANA Change state is not used, ANA Persistent Loss state + * is not supported yet. + */ + if (!(ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE || + ana_state == SPDK_NVME_ANA_NON_OPTIMIZED_STATE || + ana_state == SPDK_NVME_ANA_INACCESSIBLE_STATE)) { + SPDK_ERRLOG("ANA state %d is not supported\n", ana_state); + cb_fn(cb_arg, -ENOTSUP); + return; + } + + listener = nvmf_subsystem_find_listener(subsystem, trid); + if (!listener) { + SPDK_ERRLOG("Unable to find listener.\n"); + cb_fn(cb_arg, -EINVAL); + return; + } + + if (listener->ana_state == ana_state) { + cb_fn(cb_arg, 0); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + SPDK_ERRLOG("Unable to allocate context\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + listener->ana_state = ana_state; + listener->ana_state_change_count++; + + ctx->listener = listener; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(subsystem->tgt, + subsystem_listener_update_on_pg, + ctx, + subsystem_listener_update_done); +} diff --git a/lib/nvmf/tcp.c b/lib/nvmf/tcp.c index 12be15704f8..7b4b98a54ba 100644 --- a/lib/nvmf/tcp.c +++ b/lib/nvmf/tcp.c @@ -46,9 +46,10 @@ #include "spdk_internal/log.h" #include "spdk_internal/nvme_tcp.h" +#include "nvmf_internal.h" + #define NVMF_TCP_MAX_ACCEPT_SOCK_ONE_TIME 16 #define SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY 6 -#define SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR 4 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_tcp; @@ -214,7 +215,7 @@ struct spdk_nvmf_tcp_qpair { /* This is a spare PDU used for sending special management * operations. Primarily, this is used for the initial * connection response and c2h termination request. 
*/ - struct nvme_tcp_pdu mgmt_pdu; + struct nvme_tcp_pdu *mgmt_pdu; TAILQ_HEAD(, nvme_tcp_pdu) send_queue; @@ -472,7 +473,8 @@ nvmf_tcp_create(struct spdk_nvmf_transport_opts *opts) " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" " in_capsule_data_size=%d, max_aq_depth=%d\n" " num_shared_buffers=%d, c2h_success=%d,\n" - " dif_insert_or_strip=%d, sock_priority=%d\n", + " dif_insert_or_strip=%d, sock_priority=%d\n" + " abort_timeout_sec=%d\n", opts->max_queue_depth, opts->max_io_size, opts->max_qpairs_per_ctrlr - 1, @@ -482,7 +484,8 @@ nvmf_tcp_create(struct spdk_nvmf_transport_opts *opts) opts->num_shared_buffers, opts->c2h_success, opts->dif_insert_or_strip, - opts->sock_priority); + opts->sock_priority, + opts->abort_timeout_sec); if (opts->sock_priority > SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY) { SPDK_ERRLOG("Unsupported socket_priority=%d, the current range is: 0 to %d\n" @@ -504,7 +507,7 @@ nvmf_tcp_create(struct spdk_nvmf_transport_opts *opts) return NULL; } - min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size; + min_shared_buffers = spdk_env_get_core_count() * opts->buf_cache_size; if (min_shared_buffers > opts->num_shared_buffers) { SPDK_ERRLOG("There are not enough buffers to satisfy" "per-poll group caches for each thread. 
(%" PRIu32 ")" @@ -776,8 +779,6 @@ nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair) tqpair->resource_count = opts->max_queue_depth; - tqpair->mgmt_pdu.qpair = tqpair; - tqpair->reqs = calloc(tqpair->resource_count, sizeof(*tqpair->reqs)); if (!tqpair->reqs) { SPDK_ERRLOG("Unable to allocate reqs on tqpair=%p\n", tqpair); @@ -794,7 +795,8 @@ nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair) } } - tqpair->pdus = spdk_dma_malloc(tqpair->resource_count * sizeof(*tqpair->pdus), 0x1000, NULL); + /* Add addtional one member, which will be used for mgmt_pdu owned by the tqpair */ + tqpair->pdus = spdk_dma_malloc((tqpair->resource_count + 1) * sizeof(*tqpair->pdus), 0x1000, NULL); if (!tqpair->pdus) { SPDK_ERRLOG("Unable to allocate pdu pool on tqpair =%p.\n", tqpair); return -1; @@ -824,6 +826,9 @@ nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair) tqpair->state_cntr[TCP_REQUEST_STATE_FREE]++; } + tqpair->mgmt_pdu = &tqpair->pdus[i]; + tqpair->mgmt_pdu->qpair = tqpair; + tqpair->recv_buf_size = (in_capsule_data_size + sizeof(struct spdk_nvme_tcp_cmd) + 2 * SPDK_NVME_TCP_DIGEST_LEN) * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR; @@ -890,6 +895,7 @@ nvmf_tcp_handle_connect(struct spdk_nvmf_transport *transport, tqpair->state_cntr[TCP_REQUEST_STATE_FREE] = 0; tqpair->port = port; tqpair->qpair.transport = transport; + tqpair->qpair.trid = port->trid; rc = spdk_sock_getaddr(tqpair->sock, tqpair->target_addr, sizeof(tqpair->target_addr), &tqpair->target_port, @@ -1056,7 +1062,7 @@ nvmf_tcp_qpair_handle_timeout(void *ctx) SPDK_NVME_TCP_QPAIR_EXIT_TIMEOUT); nvmf_tcp_qpair_disconnect(tqpair); - return 0; + return SPDK_POLLER_BUSY; } static void @@ -1079,7 +1085,7 @@ nvmf_tcp_send_c2h_term_req(struct spdk_nvmf_tcp_qpair *tqpair, struct nvme_tcp_p uint32_t c2h_term_req_hdr_len = sizeof(*c2h_term_req); uint32_t copy_len; - rsp_pdu = &tqpair->mgmt_pdu; + rsp_pdu = tqpair->mgmt_pdu; c2h_term_req = &rsp_pdu->hdr.term_req; 
c2h_term_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ; @@ -1518,7 +1524,7 @@ nvmf_tcp_icreq_handle(struct spdk_nvmf_tcp_transport *ttransport, tqpair->cpda = spdk_min(ic_req->hpda, SPDK_NVME_TCP_CPDA_MAX); SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "cpda of tqpair=(%p) is : %u\n", tqpair, tqpair->cpda); - rsp_pdu = &tqpair->mgmt_pdu; + rsp_pdu = tqpair->mgmt_pdu; ic_resp = &rsp_pdu->hdr.ic_resp; ic_resp->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_RESP; @@ -1785,7 +1791,7 @@ nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair) rc = nvme_tcp_read_payload_data(tqpair->sock, pdu); if (rc < 0) { - return NVME_TCP_PDU_IN_PROGRESS; + return NVME_TCP_PDU_FATAL; } pdu->readv_offset += rc; @@ -2084,6 +2090,14 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport, group = &tqpair->group->group; assert(tcp_req->state != TCP_REQUEST_STATE_FREE); + /* If the qpair is not active, we need to abort the outstanding requests. */ + if (tqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + if (tcp_req->state == TCP_REQUEST_STATE_NEED_BUFFER) { + STAILQ_REMOVE(&group->pending_buf_queue, &tcp_req->req, spdk_nvmf_request, buf_link); + } + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_COMPLETED); + } + /* The loop here is to allow for several back-to-back state changes. */ do { prev_state = tcp_req->state; @@ -2110,6 +2124,14 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport, /* The next state transition depends on the data transfer needs of this request. */ tcp_req->req.xfer = spdk_nvmf_req_get_xfer(&tcp_req->req); + if (spdk_unlikely(tcp_req->req.xfer == SPDK_NVME_DATA_BIDIRECTIONAL)) { + tcp_req->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + tcp_req->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SC_INVALID_OPCODE; + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_COMPLETE); + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Request %p: invalid xfer type (BIDIRECTIONAL)\n", tcp_req); + break; + } + /* If no data to transfer, ready to execute. 
*/ if (tcp_req->req.xfer == SPDK_NVME_DATA_NONE) { /* Reset the tqpair receving pdu state */ @@ -2459,6 +2481,103 @@ nvmf_tcp_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, return nvmf_tcp_qpair_get_trid(qpair, trid, 0); } +static void +nvmf_tcp_req_set_abort_status(struct spdk_nvmf_request *req, + struct spdk_nvmf_tcp_req *tcp_req_to_abort) +{ + tcp_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + tcp_req_to_abort->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + + nvmf_tcp_req_set_state(tcp_req_to_abort, TCP_REQUEST_STATE_READY_TO_COMPLETE); + + req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command was successfully aborted. */ +} + +static int +_nvmf_tcp_qpair_abort_request(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_tcp_req *tcp_req_to_abort = SPDK_CONTAINEROF(req->req_to_abort, + struct spdk_nvmf_tcp_req, req); + struct spdk_nvmf_tcp_qpair *tqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair, + struct spdk_nvmf_tcp_qpair, qpair); + int rc; + + spdk_poller_unregister(&req->poller); + + switch (tcp_req_to_abort->state) { + case TCP_REQUEST_STATE_EXECUTING: + rc = nvmf_ctrlr_abort_request(req); + if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) { + return SPDK_POLLER_BUSY; + } + break; + + case TCP_REQUEST_STATE_NEED_BUFFER: + STAILQ_REMOVE(&tqpair->group->group.pending_buf_queue, + &tcp_req_to_abort->req, spdk_nvmf_request, buf_link); + + nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort); + break; + + case TCP_REQUEST_STATE_AWAITING_R2T_ACK: + nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort); + break; + + case TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + if (spdk_get_ticks() < req->timeout_tsc) { + req->poller = SPDK_POLLER_REGISTER(_nvmf_tcp_qpair_abort_request, req, 0); + return SPDK_POLLER_BUSY; + } + break; + + default: + break; + } + + spdk_nvmf_request_complete(req); + return SPDK_POLLER_BUSY; +} + +static void +nvmf_tcp_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct 
spdk_nvmf_request *req) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_transport *transport; + uint16_t cid; + uint32_t i; + struct spdk_nvmf_tcp_req *tcp_req_to_abort = NULL; + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + ttransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_tcp_transport, transport); + transport = &ttransport->transport; + + cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; + + for (i = 0; i < tqpair->resource_count; i++) { + tcp_req_to_abort = &tqpair->reqs[i]; + + if (tcp_req_to_abort->state != TCP_REQUEST_STATE_FREE && + tcp_req_to_abort->req.cmd->nvme_cmd.cid == cid) { + break; + } + } + + if (tcp_req_to_abort == NULL) { + spdk_nvmf_request_complete(req); + return; + } + + req->req_to_abort = &tcp_req_to_abort->req; + req->timeout_tsc = spdk_get_ticks() + + transport->opts.abort_timeout_sec * spdk_get_ticks_hz(); + req->poller = NULL; + + _nvmf_tcp_qpair_abort_request(req); +} + #define SPDK_NVMF_TCP_DEFAULT_MAX_QUEUE_DEPTH 128 #define SPDK_NVMF_TCP_DEFAULT_AQ_DEPTH 128 #define SPDK_NVMF_TCP_DEFAULT_MAX_QPAIRS_PER_CTRLR 128 @@ -2470,6 +2589,7 @@ nvmf_tcp_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, #define SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION true #define SPDK_NVMF_TCP_DEFAULT_DIF_INSERT_OR_STRIP false #define SPDK_NVMF_TCP_DEFAULT_SOCK_PRIORITY 0 +#define SPDK_NVMF_TCP_DEFAULT_ABORT_TIMEOUT_SEC 1 static void nvmf_tcp_opts_init(struct spdk_nvmf_transport_opts *opts) @@ -2485,6 +2605,7 @@ nvmf_tcp_opts_init(struct spdk_nvmf_transport_opts *opts) opts->c2h_success = SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION; opts->dif_insert_or_strip = SPDK_NVMF_TCP_DEFAULT_DIF_INSERT_OR_STRIP; opts->sock_priority = SPDK_NVMF_TCP_DEFAULT_SOCK_PRIORITY; + opts->abort_timeout_sec = SPDK_NVMF_TCP_DEFAULT_ABORT_TIMEOUT_SEC; } const struct spdk_nvmf_transport_ops spdk_nvmf_transport_tcp = { @@ -2514,6 +2635,7 @@ const struct spdk_nvmf_transport_ops 
spdk_nvmf_transport_tcp = { .qpair_get_local_trid = nvmf_tcp_qpair_get_local_trid, .qpair_get_peer_trid = nvmf_tcp_qpair_get_peer_trid, .qpair_get_listen_trid = nvmf_tcp_qpair_get_listen_trid, + .qpair_abort_request = nvmf_tcp_qpair_abort_request, }; SPDK_NVMF_TRANSPORT_REGISTER(tcp, &spdk_nvmf_transport_tcp); diff --git a/lib/nvmf/transport.c b/lib/nvmf/transport.c index 92a87717f27..5c2e2b64aa4 100644 --- a/lib/nvmf/transport.c +++ b/lib/nvmf/transport.c @@ -44,6 +44,7 @@ #include "spdk/util.h" #define MAX_MEMPOOL_NAME_LENGTH 40 +#define NVMF_TRANSPORT_DEFAULT_ASSOCIATION_TIMEOUT_IN_MS 120000 struct nvmf_transport_ops_list_element { struct spdk_nvmf_transport_ops ops; @@ -254,6 +255,85 @@ spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport, return 0; } +struct nvmf_stop_listen_ctx { + struct spdk_nvmf_transport *transport; + struct spdk_nvme_transport_id trid; + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn; + void *cb_arg; +}; + +static void +nvmf_stop_listen_fini(struct spdk_io_channel_iter *i, int status) +{ + struct nvmf_stop_listen_ctx *ctx; + struct spdk_nvmf_transport *transport; + int rc = status; + + ctx = spdk_io_channel_iter_get_ctx(i); + transport = ctx->transport; + assert(transport != NULL); + + rc = spdk_nvmf_transport_stop_listen(transport, &ctx->trid); + if (rc) { + SPDK_ERRLOG("Failed to stop listening on address '%s'\n", ctx->trid.traddr); + } + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, rc); + } + free(ctx); +} + +static void +nvmf_stop_listen_disconnect_qpairs(struct spdk_io_channel_iter *i) +{ + struct nvmf_stop_listen_ctx *ctx; + struct spdk_nvmf_poll_group *group; + struct spdk_io_channel *ch; + struct spdk_nvmf_qpair *qpair, *tmp_qpair; + struct spdk_nvme_transport_id tmp_trid; + + ctx = spdk_io_channel_iter_get_ctx(i); + ch = spdk_io_channel_iter_get_channel(i); + group = spdk_io_channel_get_ctx(ch); + + TAILQ_FOREACH_SAFE(qpair, &group->qpairs, link, tmp_qpair) { + /* skip qpairs that don't match the TRID. 
*/ + if (spdk_nvmf_qpair_get_listen_trid(qpair, &tmp_trid)) { + continue; + } + + if (!spdk_nvme_transport_id_compare(&ctx->trid, &tmp_trid)) { + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + } + } + spdk_for_each_channel_continue(i, 0); +} + +int +spdk_nvmf_transport_stop_listen_async(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid, + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn, + void *cb_arg) +{ + struct nvmf_stop_listen_ctx *ctx; + + ctx = calloc(1, sizeof(struct nvmf_stop_listen_ctx)); + if (ctx == NULL) { + return -ENOMEM; + } + + ctx->trid = *trid; + ctx->transport = transport; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(transport->tgt, nvmf_stop_listen_disconnect_qpairs, ctx, + nvmf_stop_listen_fini); + + return 0; +} + uint32_t nvmf_transport_accept(struct spdk_nvmf_transport *transport) { @@ -272,7 +352,8 @@ struct spdk_nvmf_transport_poll_group * nvmf_transport_poll_group_create(struct spdk_nvmf_transport *transport) { struct spdk_nvmf_transport_poll_group *group; - struct spdk_nvmf_transport_pg_cache_buf *buf; + struct spdk_nvmf_transport_pg_cache_buf **bufs; + uint32_t i; group = transport->ops->poll_group_create(transport); if (!group) { @@ -284,17 +365,34 @@ nvmf_transport_poll_group_create(struct spdk_nvmf_transport *transport) STAILQ_INIT(&group->buf_cache); if (transport->opts.buf_cache_size) { - group->buf_cache_count = 0; group->buf_cache_size = transport->opts.buf_cache_size; - while (group->buf_cache_count < group->buf_cache_size) { - buf = (struct spdk_nvmf_transport_pg_cache_buf *)spdk_mempool_get(transport->data_buf_pool); - if (!buf) { - SPDK_NOTICELOG("Unable to reserve the full number of buffers for the pg buffer cache.\n"); - break; + bufs = calloc(group->buf_cache_size, sizeof(struct spdk_nvmf_transport_pg_cache_buf *)); + + if (!bufs) { + SPDK_ERRLOG("Memory allocation failed, can't reserve buffers for the pg buffer cache\n"); + return group; + } + + if 
(spdk_mempool_get_bulk(transport->data_buf_pool, (void **)bufs, group->buf_cache_size)) { + group->buf_cache_size = (uint32_t)spdk_mempool_count(transport->data_buf_pool); + SPDK_NOTICELOG("Unable to reserve the full number of buffers for the pg buffer cache. " + "Decrease the number of cached buffers from %u to %u\n", + transport->opts.buf_cache_size, group->buf_cache_size); + /* Sanity check */ + assert(group->buf_cache_size <= transport->opts.buf_cache_size); + /* Try again with less number of buffers */ + if (spdk_mempool_get_bulk(transport->data_buf_pool, (void **)bufs, group->buf_cache_size)) { + SPDK_NOTICELOG("Failed to reserve %u buffers\n", group->buf_cache_size); + group->buf_cache_size = 0; } - STAILQ_INSERT_HEAD(&group->buf_cache, buf, link); - group->buf_cache_count++; } + + for (i = 0; i < group->buf_cache_size; i++) { + STAILQ_INSERT_HEAD(&group->buf_cache, bufs[i], link); + } + group->buf_cache_count = group->buf_cache_size; + + free(bufs); } return group; } @@ -401,6 +499,13 @@ nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, return qpair->transport->ops->qpair_get_listen_trid(qpair, trid); } +void +nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + qpair->transport->ops->qpair_abort_request(qpair, req); +} + bool spdk_nvmf_transport_opts_init(const char *transport_name, struct spdk_nvmf_transport_opts *opts) @@ -413,6 +518,7 @@ spdk_nvmf_transport_opts_init(const char *transport_name, return false; } + opts->association_timeout = NVMF_TRANSPORT_DEFAULT_ASSOCIATION_TIMEOUT_IN_MS; ops->opts_init(opts); return true; } diff --git a/lib/nvmf/transport.h b/lib/nvmf/transport.h index 5bf6847a898..38b5d8db371 100644 --- a/lib/nvmf/transport.h +++ b/lib/nvmf/transport.h @@ -76,4 +76,7 @@ int nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, int nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, struct spdk_nvme_transport_id *trid); +void 
nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req); + #endif /* SPDK_NVMF_TRANSPORT_H */ diff --git a/lib/rocksdb/env_spdk.cc b/lib/rocksdb/env_spdk.cc index 8695acca625..ce8bed9235c 100644 --- a/lib/rocksdb/env_spdk.cc +++ b/lib/rocksdb/env_spdk.cc @@ -416,7 +416,7 @@ class SpdkEnv : public EnvWrapper virtual ~SpdkEnv(); virtual Status NewSequentialFile(const std::string &fname, - unique_ptr *result, + std::unique_ptr *result, const EnvOptions &options) override { if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { @@ -444,7 +444,7 @@ class SpdkEnv : public EnvWrapper } virtual Status NewRandomAccessFile(const std::string &fname, - unique_ptr *result, + std::unique_ptr *result, const EnvOptions &options) override { if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { @@ -468,7 +468,7 @@ class SpdkEnv : public EnvWrapper } virtual Status NewWritableFile(const std::string &fname, - unique_ptr *result, + std::unique_ptr *result, const EnvOptions &options) override { if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { @@ -493,14 +493,14 @@ class SpdkEnv : public EnvWrapper virtual Status ReuseWritableFile(const std::string &fname, const std::string &old_fname, - unique_ptr *result, + std::unique_ptr *result, const EnvOptions &options) override { return EnvWrapper::ReuseWritableFile(fname, old_fname, result, options); } virtual Status NewDirectory(__attribute__((unused)) const std::string &name, - unique_ptr *result) override + std::unique_ptr *result) override { result->reset(new SpdkDirectory()); return Status::OK(); @@ -671,6 +671,7 @@ rocksdb_run(__attribute__((unused)) void *arg1) if (bdev == NULL) { SPDK_ERRLOG("bdev %s not found\n", g_bdev_name.c_str()); + spdk_app_stop(0); exit(1); } diff --git a/lib/rocksdb/spdk.rocksdb.mk b/lib/rocksdb/spdk.rocksdb.mk index c026a602ec7..c55017303b0 100644 --- a/lib/rocksdb/spdk.rocksdb.mk +++ b/lib/rocksdb/spdk.rocksdb.mk @@ -41,7 +41,7 @@ CXXFLAGS 
+= -I$(SPDK_DIR)/include -Iinclude/ # The SPDK makefiles turn this on, but RocksDB won't compile with it. So # turn it off after including the SPDK makefiles. -CXXFLAGS += -Wno-missing-declarations +CXXFLAGS += -Wno-missing-declarations -Wno-maybe-uninitialized # The SPDK Makefiles may turn these options on but we do not want to enable # them for the RocksDB source files. @@ -54,8 +54,9 @@ CXXFLAGS += -fno-sanitize=address endif SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += bdev accel event util conf trace log jsonrpc json rpc sock thread notify +SPDK_LIB_LIST += bdev_rpc blobfs_bdev AM_LINK += $(SPDK_LIB_LINKER_ARGS) $(ENV_LINKER_ARGS) AM_LINK += $(SYS_LIBS) diff --git a/lib/rpc/rpc.c b/lib/rpc/rpc.c index 7182f41e968..9662b887dcf 100644 --- a/lib/rpc/rpc.c +++ b/lib/rpc/rpc.c @@ -127,10 +127,21 @@ jsonrpc_handler(struct spdk_jsonrpc_request *request, if ((m->state_mask & g_rpc_state) == g_rpc_state) { m->func(request, params); } else { - spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE, - "Method is allowed in any state in the mask (%"PRIx32")," - " but current state is (%"PRIx32")", - m->state_mask, g_rpc_state); + if (g_rpc_state == SPDK_RPC_STARTUP) { + spdk_jsonrpc_send_error_response_fmt(request, + SPDK_JSONRPC_ERROR_INVALID_STATE, + "Method may only be called after " + "framework is initialized " + "using framework_start_init RPC."); + } else { + spdk_jsonrpc_send_error_response_fmt(request, + SPDK_JSONRPC_ERROR_INVALID_STATE, + "Method may only be called before " + "framework is initialized. 
" + "Use --wait-for-rpc command line " + "parameter and then issue this RPC " + "before the framework_start_init RPC."); + } } } diff --git a/lib/scsi/lun.c b/lib/scsi/lun.c index 54e75f24d63..262137d8089 100644 --- a/lib/scsi/lun.c +++ b/lib/scsi/lun.c @@ -95,12 +95,12 @@ scsi_lun_reset_check_outstanding_tasks(void *arg) struct spdk_scsi_lun *lun = task->lun; if (scsi_lun_has_outstanding_tasks(lun)) { - return 0; + return SPDK_POLLER_BUSY; } spdk_poller_unregister(&lun->reset_poller); scsi_lun_complete_mgmt_task(lun, task); - return 1; + return SPDK_POLLER_BUSY; } void @@ -197,7 +197,11 @@ _scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) TAILQ_INSERT_TAIL(&lun->tasks, task, scsi_link); if (!lun->removed) { /* Check the command is allowed or not when reservation is exist */ - rc = scsi_pr_check(task); + if (spdk_unlikely(lun->reservation.flags & SCSI_SPC2_RESERVE)) { + rc = scsi2_reserve_check(task); + } else { + rc = scsi_pr_check(task); + } if (spdk_unlikely(rc < 0)) { /* Reservation Conflict */ rc = SPDK_SCSI_TASK_COMPLETE; @@ -295,12 +299,12 @@ scsi_lun_check_io_channel(void *arg) struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; if (lun->io_channel) { - return -1; + return SPDK_POLLER_BUSY; } spdk_poller_unregister(&lun->hotremove_poller); scsi_lun_remove(lun); - return -1; + return SPDK_POLLER_BUSY; } static void @@ -335,12 +339,12 @@ scsi_lun_check_outstanding_tasks(void *arg) if (scsi_lun_has_outstanding_tasks(lun) || scsi_lun_has_outstanding_mgmt_tasks(lun)) { - return -1; + return SPDK_POLLER_BUSY; } spdk_poller_unregister(&lun->hotremove_poller); scsi_lun_notify_hot_remove(lun); - return -1; + return SPDK_POLLER_BUSY; } static void diff --git a/lib/scsi/scsi_bdev.c b/lib/scsi/scsi_bdev.c index b8affcf247d..bf0fb5af74c 100644 --- a/lib/scsi/scsi_bdev.c +++ b/lib/scsi/scsi_bdev.c @@ -1712,7 +1712,6 @@ bdev_scsi_process_primary(struct spdk_scsi_task *task) int dbd, pc, page, subpage; int cmd_parsed = 0; - switch (cdb[0]) 
{ case SPDK_SPC_INQUIRY: alloc_len = from_be16(&cdb[3]); @@ -1931,6 +1930,22 @@ bdev_scsi_process_primary(struct spdk_scsi_task *task) rc = scsi_pr_in(task, cdb, data, data_len); break; + case SPDK_SPC2_RESERVE_6: + case SPDK_SPC2_RESERVE_10: + rc = scsi2_reserve(task, cdb); + if (rc == 0) { + if (cdb[0] == SPDK_SPC2_RESERVE_10) { + rc = from_be16(&cdb[7]); + } + data_len = 0; + } + break; + + case SPDK_SPC2_RELEASE_6: + case SPDK_SPC2_RELEASE_10: + rc = scsi2_release(task); + break; + default: return SPDK_SCSI_TASK_UNKNOWN; } diff --git a/lib/scsi/scsi_internal.h b/lib/scsi/scsi_internal.h index bf87bd41e1d..2da3a99a855 100644 --- a/lib/scsi/scsi_internal.h +++ b/lib/scsi/scsi_internal.h @@ -73,8 +73,11 @@ struct spdk_scsi_pr_registrant { TAILQ_ENTRY(spdk_scsi_pr_registrant) link; }; +#define SCSI_SPC2_RESERVE 0x00000001U + /* Reservation with LU_SCOPE */ struct spdk_scsi_pr_reservation { + uint32_t flags; struct spdk_scsi_pr_registrant *holder; enum spdk_scsi_pr_type_code rtype; uint64_t crkey; @@ -144,6 +147,8 @@ struct spdk_scsi_lun { uint32_t pr_generation; /** Reservation for the LUN */ struct spdk_scsi_pr_reservation reservation; + /** Reservation holder for SPC2 RESERVE(6) and RESERVE(10) */ + struct spdk_scsi_pr_registrant scsi2_holder; /** List of open descriptors for this LUN. 
*/ TAILQ_HEAD(, spdk_scsi_lun_desc) open_descs; @@ -196,6 +201,10 @@ int scsi_pr_out(struct spdk_scsi_task *task, uint8_t *cdb, uint8_t *data, uint16 int scsi_pr_in(struct spdk_scsi_task *task, uint8_t *cdb, uint8_t *data, uint16_t data_len); int scsi_pr_check(struct spdk_scsi_task *task); +int scsi2_reserve(struct spdk_scsi_task *task, uint8_t *cdb); +int scsi2_release(struct spdk_scsi_task *task); +int scsi2_reserve_check(struct spdk_scsi_task *task); + struct spdk_scsi_globals { pthread_mutex_t mutex; }; diff --git a/lib/scsi/scsi_pr.c b/lib/scsi/scsi_pr.c index 8bbc132c554..4e17cc2c67b 100644 --- a/lib/scsi/scsi_pr.c +++ b/lib/scsi/scsi_pr.c @@ -53,6 +53,23 @@ scsi_pr_get_registrant(struct spdk_scsi_lun *lun, return NULL; } +static bool +scsi2_it_nexus_is_holder(struct spdk_scsi_lun *lun, + struct spdk_scsi_port *initiator_port, + struct spdk_scsi_port *target_port) +{ + struct spdk_scsi_pr_registrant *reg = lun->reservation.holder; + + assert(reg != NULL); + + if ((reg->initiator_port == initiator_port) && + (reg->target_port == target_port)) { + return true; + } + + return false; +} + /* Reservation type is all registrants or not */ static inline bool scsi_pr_is_all_registrants_type(struct spdk_scsi_lun *lun) @@ -638,8 +655,9 @@ scsi_pr_in_report_capabilities(struct spdk_scsi_task *task, param = (struct spdk_scsi_pr_in_report_capabilities_data *)data; memset(param, 0, sizeof(*param)); - /* TODO: can support more capabilities bits */ to_be16(¶m->length, sizeof(*param)); + /* Compatible reservation handling to support RESERVE/RELEASE defined in SPC-2 */ + param->crh = 1; param->tmv = 1; param->wr_ex = 1; param->ex_ac = 1; @@ -775,6 +793,12 @@ scsi_pr_check(struct spdk_scsi_task *task) case SPDK_SBC_READ_CAPACITY_10: case SPDK_SPC_PERSISTENT_RESERVE_IN: case SPDK_SPC_SERVICE_ACTION_IN_16: + /* CRH enabled, processed by scsi2_reserve() */ + case SPDK_SPC2_RESERVE_6: + case SPDK_SPC2_RESERVE_10: + /* CRH enabled, processed by scsi2_release() */ + case 
SPDK_SPC2_RELEASE_6: + case SPDK_SPC2_RELEASE_10: return 0; case SPDK_SPC_MODE_SELECT_6: case SPDK_SPC_MODE_SELECT_10: @@ -878,3 +902,166 @@ scsi_pr_check(struct spdk_scsi_task *task) SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); return -1; } + +static int +scsi2_check_reservation_conflict(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + bool conflict = false; + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (reg) { + /* + * From spc4r31 5.9.3 Exceptions to SPC-2 RESERVE and RELEASE + * behavior + * + * A RESERVE(6) or RESERVE(10) command shall complete with GOOD + * status, but no reservation shall be established and the + * persistent reservation shall not be changed, if the command + * is received from a) and b) below. + * + * A RELEASE(6) or RELEASE(10) command shall complete with GOOD + * status, but the persistent reservation shall not be released, + * if the command is received from a) and b) + * + * a) An I_T nexus that is a persistent reservation holder; or + * b) An I_T nexus that is registered if a registrants only or + * all registrants type persistent reservation is present. + * + * In all other cases, a RESERVE(6) command, RESERVE(10) command, + * RELEASE(6) command, or RELEASE(10) command shall be processed + * as defined in SPC-2. 
+ */ + if (scsi_pr_registrant_is_holder(lun, reg)) { + return 1; + } + + if (lun->reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY || + lun->reservation.rtype == SPDK_SCSI_PR_EXCLUSIVE_ACCESS_REGS_ONLY) { + return 1; + } + + conflict = true; + } else { + /* + * From spc2r20 5.5.1 Reservations overview: + * + * If a logical unit has executed a PERSISTENT RESERVE OUT + * command with the REGISTER or the REGISTER AND IGNORE + * EXISTING KEY service action and is still registered by any + * initiator, all RESERVE commands and all RELEASE commands + * regardless of initiator shall conflict and shall terminate + * with a RESERVATION CONFLICT status. + */ + conflict = TAILQ_EMPTY(&lun->reg_head) ? false : true; + } + + if (conflict) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + return 0; +} + +int +scsi2_reserve(struct spdk_scsi_task *task, uint8_t *cdb) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg = &lun->scsi2_holder; + int ret; + + /* Obsolete Bits and LongID set, returning ILLEGAL_REQUEST */ + if (cdb[1] & 0x3) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + ret = scsi2_check_reservation_conflict(task); + /* PERSISTENT RESERVE is enabled */ + if (ret == 1) { + return 0; + } else if (ret < 0) { + return ret; + } + + /* SPC2 RESERVE */ + reg->initiator_port = task->initiator_port; + if (task->initiator_port) { + snprintf(reg->initiator_port_name, sizeof(reg->initiator_port_name), "%s", + task->initiator_port->name); + reg->transport_id_len = task->initiator_port->transport_id_len; + memcpy(reg->transport_id, task->initiator_port->transport_id, + reg->transport_id_len); + } + reg->target_port = task->target_port; + if 
(task->target_port) { + snprintf(reg->target_port_name, sizeof(reg->target_port_name), "%s", + task->target_port->name); + } + + lun->reservation.flags = SCSI_SPC2_RESERVE; + lun->reservation.holder = &lun->scsi2_holder; + + return 0; +} + +int +scsi2_release(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + int ret; + + ret = scsi2_check_reservation_conflict(task); + /* PERSISTENT RESERVE is enabled */ + if (ret == 1) { + return 0; + } else if (ret < 0) { + return ret; + } + + assert(lun->reservation.flags & SCSI_SPC2_RESERVE); + + memset(&lun->reservation, 0, sizeof(struct spdk_scsi_pr_reservation)); + memset(&lun->scsi2_holder, 0, sizeof(struct spdk_scsi_pr_registrant)); + + return 0; +} + +int scsi2_reserve_check(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + uint8_t *cdb = task->cdb; + + switch (cdb[0]) { + case SPDK_SPC_INQUIRY: + case SPDK_SPC2_RELEASE_6: + case SPDK_SPC2_RELEASE_10: + return 0; + + default: + break; + } + + /* no reservation holders */ + if (!scsi_pr_has_reservation(lun)) { + return 0; + } + + if (scsi2_it_nexus_is_holder(lun, task->initiator_port, task->target_port)) { + return 0; + } + + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} diff --git a/lib/sock/Makefile b/lib/sock/Makefile index 38bf8017530..b063bf281de 100644 --- a/lib/sock/Makefile +++ b/lib/sock/Makefile @@ -34,10 +34,10 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk -SO_VER := 3 -SO_MINOR := 1 +SO_VER := 5 +SO_MINOR := 0 -C_SRCS = sock.c net_framework.c +C_SRCS = sock.c net_framework.c sock_rpc.c LIBNAME = sock diff --git a/lib/sock/sock.c b/lib/sock/sock.c index 096cefd9ae8..6c407c71eed 100644 --- a/lib/sock/sock.c +++ b/lib/sock/sock.c @@ -146,13 +146,29 @@ sock_remove_sock_group_from_map_table(struct spdk_sock_group *group) } +static int +sock_get_placement_id(struct spdk_sock *sock) +{ + int rc; + int placement_id; + + if (!sock->placement_id) { + rc = sock->net_impl->get_placement_id(sock, &placement_id); + if (!rc && (placement_id != 0)) { + sock->placement_id = placement_id; + } + } + + return sock->placement_id; +} + int spdk_sock_get_optimal_sock_group(struct spdk_sock *sock, struct spdk_sock_group **group) { - int placement_id = 0, rc; + int placement_id; - rc = sock->net_impl->get_placement_id(sock, &placement_id); - if (!rc && (placement_id != 0)) { + placement_id = sock_get_placement_id(sock); + if (placement_id != 0) { sock_map_lookup(placement_id, group); return 0; } else { @@ -508,8 +524,8 @@ spdk_sock_group_add_sock(struct spdk_sock_group *group, struct spdk_sock *sock, return -1; } - rc = sock->net_impl->get_placement_id(sock, &placement_id); - if (!rc && (placement_id != 0)) { + placement_id = sock_get_placement_id(sock); + if (placement_id != 0) { rc = sock_map_insert(placement_id, group); if (rc < 0) { return -1; @@ -557,8 +573,8 @@ spdk_sock_group_remove_sock(struct spdk_sock_group *group, struct spdk_sock *soc assert(group_impl == sock->group_impl); - rc = sock->net_impl->get_placement_id(sock, &placement_id); - if (!rc && (placement_id != 0)) { + placement_id = sock_get_placement_id(sock); + if (placement_id != 0) { sock_map_release(placement_id); } @@ -751,6 +767,45 @@ spdk_sock_impl_set_opts(const char *impl_name, const struct spdk_sock_impl_opts return impl->set_opts(opts, len); } +/* Write a "sock_impl_set_options" entry for each impl in g_net_impls that has get_opts. */ +void +spdk_sock_write_config_json(struct spdk_json_write_ctx *w) +{ + struct
spdk_net_impl *impl; + struct spdk_sock_impl_opts opts; + size_t len; + + assert(w != NULL); + + spdk_json_write_array_begin(w); + + STAILQ_FOREACH(impl, &g_net_impls, link) { + if (!impl->get_opts) { + continue; + } + + len = sizeof(opts); + if (impl->get_opts(&opts, &len) == 0) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "sock_impl_set_options"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "impl_name", impl->name); + spdk_json_write_named_uint32(w, "recv_buf_size", opts.recv_buf_size); + spdk_json_write_named_uint32(w, "send_buf_size", opts.send_buf_size); + spdk_json_write_named_bool(w, "enable_recv_pipe", opts.enable_recv_pipe); + spdk_json_write_named_bool(w, "enable_zerocopy_send", opts.enable_zerocopy_send); + spdk_json_write_named_bool(w, "enable_quickack", opts.enable_quickack); + spdk_json_write_named_bool(w, "enable_placement_id", opts.enable_placement_id); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + } else { + SPDK_ERRLOG("Failed to get socket options for socket implementation %s\n", impl->name); + } + } + + spdk_json_write_array_end(w); +} + void spdk_net_impl_register(struct spdk_net_impl *impl, int priority) { diff --git a/lib/sock/sock_rpc.c b/lib/sock/sock_rpc.c new file mode 100644 index 00000000000..c4e3e653ea8 --- /dev/null +++ b/lib/sock/sock_rpc.c @@ -0,0 +1,180 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution.
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include "spdk/sock.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + + +static const struct spdk_json_object_decoder rpc_sock_impl_get_opts_decoders[] = { + { "impl_name", 0, spdk_json_decode_string, false }, +}; + +/* RPC handler: return the options of the socket implementation named by "impl_name". */ +static void +rpc_sock_impl_get_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + char *impl_name = NULL; + struct spdk_sock_impl_opts sock_opts = {}; + struct spdk_json_write_ctx *w; + size_t len; + int rc; + + if (spdk_json_decode_object(params, rpc_sock_impl_get_opts_decoders, + SPDK_COUNTOF(rpc_sock_impl_get_opts_decoders), &impl_name)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(impl_name); + return; + } + + len = sizeof(sock_opts); + rc = spdk_sock_impl_get_opts(impl_name, &sock_opts, &len); + if (rc) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(impl_name); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + spdk_json_write_named_uint32(w, "recv_buf_size", sock_opts.recv_buf_size); + spdk_json_write_named_uint32(w, "send_buf_size", sock_opts.send_buf_size); + spdk_json_write_named_bool(w, "enable_recv_pipe", sock_opts.enable_recv_pipe); + spdk_json_write_named_bool(w, "enable_zerocopy_send", sock_opts.enable_zerocopy_send); + spdk_json_write_named_bool(w, "enable_quickack", sock_opts.enable_quickack); + spdk_json_write_named_bool(w, "enable_placement_id", sock_opts.enable_placement_id); + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); + free(impl_name); +} +SPDK_RPC_REGISTER("sock_impl_get_options", rpc_sock_impl_get_options, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) + +struct spdk_rpc_sock_impl_set_opts { + char *impl_name; + struct spdk_sock_impl_opts sock_opts; +}; + +static const struct spdk_json_object_decoder
rpc_sock_impl_set_opts_decoders[] = { + { + "impl_name", offsetof(struct spdk_rpc_sock_impl_set_opts, impl_name), + spdk_json_decode_string, false + }, + { + "recv_buf_size", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.recv_buf_size), + spdk_json_decode_uint32, true + }, + { + "send_buf_size", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.send_buf_size), + spdk_json_decode_uint32, true + }, + { + "enable_recv_pipe", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_recv_pipe), + spdk_json_decode_bool, true + }, + { + "enable_zerocopy_send", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_zerocopy_send), + spdk_json_decode_bool, true + }, + { + "enable_quickack", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_quickack), + spdk_json_decode_bool, true + }, + { + "enable_placement_id", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_placement_id), + spdk_json_decode_bool, true + }, + +}; + +/* RPC handler: overlay supplied options on spdk_sock_impl_get_opts() output, then set them. */ +static void +rpc_sock_impl_set_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_rpc_sock_impl_set_opts opts = {}; + struct spdk_json_write_ctx *w; + size_t len; + int rc; + + /* Get type */ + if (spdk_json_decode_object(params, rpc_sock_impl_set_opts_decoders, + SPDK_COUNTOF(rpc_sock_impl_set_opts_decoders), &opts)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(opts.impl_name); + return; + } + + /* Retrieve default opts for requested socket implementation */ + len = sizeof(opts.sock_opts); + rc = spdk_sock_impl_get_opts(opts.impl_name, &opts.sock_opts, &len); + if (rc) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(opts.impl_name); + return; + } + + /* Decode opts */ + if (spdk_json_decode_object(params, rpc_sock_impl_set_opts_decoders, + SPDK_COUNTOF(rpc_sock_impl_set_opts_decoders), &opts)) { +
SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(opts.impl_name); + return; + } + + rc = spdk_sock_impl_set_opts(opts.impl_name, &opts.sock_opts, sizeof(opts.sock_opts)); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(opts.impl_name); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + free(opts.impl_name); +} +SPDK_RPC_REGISTER("sock_impl_set_options", rpc_sock_impl_set_options, SPDK_RPC_STARTUP) diff --git a/lib/sock/spdk_sock.map b/lib/sock/spdk_sock.map index 20d6fd93b73..e3fb442810d 100644 --- a/lib/sock/spdk_sock.map +++ b/lib/sock/spdk_sock.map @@ -31,6 +31,7 @@ spdk_sock_get_optimal_sock_group; spdk_sock_impl_get_opts; spdk_sock_impl_set_opts; + spdk_sock_write_config_json; # public functions in spdk/net.h spdk_net_framework_register; diff --git a/lib/util/Makefile b/lib/util/Makefile index 23f8db6d002..cfacef242d7 100644 --- a/lib/util/Makefile +++ b/lib/util/Makefile @@ -35,7 +35,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk SO_VER := 2 -SO_MINOR := 0 +SO_MINOR := 1 C_SRCS = base64.c bit_array.c cpuset.c crc16.c crc32.c crc32c.c crc32_ieee.c \ dif.c fd.c file.c iov.c math.c pipe.c strerror_tls.c string.c uuid.c diff --git a/lib/util/bit_array.c b/lib/util/bit_array.c index 43c1a4d9bb0..c85be9d744e 100644 --- a/lib/util/bit_array.c +++ b/lib/util/bit_array.c @@ -34,6 +34,7 @@ #include "spdk/stdinc.h" #include "spdk/bit_array.h" +#include "spdk/bit_pool.h" #include "spdk/env.h" #include "spdk/likely.h" @@ -361,3 +362,160 @@ spdk_bit_array_clear_mask(struct spdk_bit_array *ba) spdk_bit_array_clear(ba, i + size * CHAR_BIT); } } + +struct spdk_bit_pool { + struct spdk_bit_array *array; + uint32_t lowest_free_bit; + uint32_t free_count; +}; + +struct spdk_bit_pool * +spdk_bit_pool_create(uint32_t num_bits) +{ + struct spdk_bit_pool *pool = NULL; + struct spdk_bit_array *array; + + array = spdk_bit_array_create(num_bits); + if (array == NULL) { + return NULL; + } + + pool = calloc(1, sizeof(*pool)); + if (pool == NULL) { + spdk_bit_array_free(&array); + return NULL; + } + + pool->array = array; + pool->lowest_free_bit = 0; + pool->free_count = num_bits; + + return pool; +} + +struct spdk_bit_pool * +spdk_bit_pool_create_from_array(struct spdk_bit_array *array) +{ + struct spdk_bit_pool *pool = NULL; + + pool = calloc(1, sizeof(*pool)); + if (pool == NULL) { + return NULL; + } + + pool->array = array; + pool->lowest_free_bit = spdk_bit_array_find_first_clear(array, 0); + pool->free_count = spdk_bit_array_count_clear(array); + + return pool; +} + +void +spdk_bit_pool_free(struct spdk_bit_pool **ppool) +{ + struct spdk_bit_pool *pool; + + if (!ppool) { + return; + } + + pool = *ppool; + *ppool = NULL; + if (pool != NULL) { + spdk_bit_array_free(&pool->array); + free(pool); + } +} + +int +spdk_bit_pool_resize(struct spdk_bit_pool **ppool, uint32_t num_bits) +{ + struct spdk_bit_pool *pool; + int rc; + + assert(ppool != NULL); + + pool = *ppool; + rc = 
spdk_bit_array_resize(&pool->array, num_bits); + if (rc) { + return rc; + } + + pool->lowest_free_bit = spdk_bit_array_find_first_clear(pool->array, 0); + pool->free_count = spdk_bit_array_count_clear(pool->array); + + return 0; +} + +uint32_t +spdk_bit_pool_capacity(const struct spdk_bit_pool *pool) +{ + return spdk_bit_array_capacity(pool->array); +} + +bool +spdk_bit_pool_is_allocated(const struct spdk_bit_pool *pool, uint32_t bit_index) +{ + return spdk_bit_array_get(pool->array, bit_index); +} + +uint32_t +spdk_bit_pool_allocate_bit(struct spdk_bit_pool *pool) +{ + uint32_t bit_index = pool->lowest_free_bit; + + if (bit_index == UINT32_MAX) { + return UINT32_MAX; + } + + spdk_bit_array_set(pool->array, bit_index); + pool->lowest_free_bit = spdk_bit_array_find_first_clear(pool->array, bit_index); + pool->free_count--; + return bit_index; +} + +void +spdk_bit_pool_free_bit(struct spdk_bit_pool *pool, uint32_t bit_index) +{ + assert(spdk_bit_array_get(pool->array, bit_index) == true); + + spdk_bit_array_clear(pool->array, bit_index); + if (pool->lowest_free_bit > bit_index) { + pool->lowest_free_bit = bit_index; + } + pool->free_count++; +} + +uint32_t +spdk_bit_pool_count_allocated(const struct spdk_bit_pool *pool) +{ + return spdk_bit_array_capacity(pool->array) - pool->free_count; +} + +uint32_t +spdk_bit_pool_count_free(const struct spdk_bit_pool *pool) +{ + return pool->free_count; +} + +void +spdk_bit_pool_store_mask(const struct spdk_bit_pool *pool, void *mask) +{ + spdk_bit_array_store_mask(pool->array, mask); +} + +void +spdk_bit_pool_load_mask(struct spdk_bit_pool *pool, const void *mask) +{ + spdk_bit_array_load_mask(pool->array, mask); + pool->lowest_free_bit = spdk_bit_array_find_first_clear(pool->array, 0); + pool->free_count = spdk_bit_array_count_clear(pool->array); +} + +void +spdk_bit_pool_free_all_bits(struct spdk_bit_pool *pool) +{ + spdk_bit_array_clear_mask(pool->array); + pool->lowest_free_bit = 0; + pool->free_count = 
spdk_bit_array_capacity(pool->array); +} diff --git a/lib/util/cpuset.c b/lib/util/cpuset.c index 8d7c8dc89a9..376b0330eb9 100644 --- a/lib/util/cpuset.c +++ b/lib/util/cpuset.c @@ -293,7 +293,7 @@ parse_mask(const char *mask, struct spdk_cpuset *set, size_t len) SPDK_ERRLOG("Invalid character in core mask '%s' (%c)\n", mask, c); return -1; } - for (j = 0; j < 4 && lcore < sizeof(set->cpus); j++, lcore++) { + for (j = 0; j < 4 && lcore < SPDK_CPUSET_SIZE; j++, lcore++) { if ((1 << j) & val) { spdk_cpuset_set_cpu(set, lcore, true); } diff --git a/lib/util/spdk_util.map b/lib/util/spdk_util.map index 07e067faaea..118f7511daf 100644 --- a/lib/util/spdk_util.map +++ b/lib/util/spdk_util.map @@ -23,6 +23,21 @@ spdk_bit_array_load_mask; spdk_bit_array_clear_mask; + # public functions in bit_pool.h + spdk_bit_pool_capacity; + spdk_bit_pool_create; + spdk_bit_pool_create_from_array; + spdk_bit_pool_free; + spdk_bit_pool_resize; + spdk_bit_pool_is_allocated; + spdk_bit_pool_allocate_bit; + spdk_bit_pool_free_bit; + spdk_bit_pool_count_allocated; + spdk_bit_pool_count_free; + spdk_bit_pool_store_mask; + spdk_bit_pool_load_mask; + spdk_bit_pool_free_all_bits; + # public functions in cpuset.h spdk_cpuset_alloc; spdk_cpuset_free; diff --git a/lib/vhost/rte_vhost_compat.c b/lib/vhost/rte_vhost_compat.c index 53f31bfd754..d73060d3eda 100644 --- a/lib/vhost/rte_vhost_compat.c +++ b/lib/vhost/rte_vhost_compat.c @@ -175,7 +175,6 @@ extern_vhost_pre_msg_handler(int vid, void *_msg) case VHOST_USER_SET_VRING_BASE: case VHOST_USER_SET_VRING_ADDR: case VHOST_USER_SET_VRING_NUM: - case VHOST_USER_SET_VRING_KICK: if (vsession->forced_polling && vsession->started) { /* Additional queues are being initialized, so we either processed * enough I/Os and are switching from SeaBIOS to the OS now, or @@ -186,6 +185,12 @@ extern_vhost_pre_msg_handler(int vid, void *_msg) vsession->forced_polling = false; } break; + case VHOST_USER_SET_VRING_KICK: + /* rte_vhost(after 20.08) will call new_device 
after one active vring is + * configured, we will start the session before all vrings are available, + * so for each new vring, if the session is started, we need to restart it + * again. + */ case VHOST_USER_SET_VRING_CALL: /* rte_vhost will close the previous callfd and won't notify * us about any change. This will effectively make SPDK fail diff --git a/lib/vhost/vhost_blk.c b/lib/vhost/vhost_blk.c index e722823c21a..d387cb27d82 100644 --- a/lib/vhost/vhost_blk.c +++ b/lib/vhost/vhost_blk.c @@ -44,6 +44,7 @@ #include "spdk/vhost.h" #include "vhost_internal.h" +#include <rte_version.h> /* Minimal set of features supported by every SPDK VHOST-BLK device */ #define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \ @@ -688,7 +689,7 @@ vdev_worker(void *arg) vhost_session_used_signal(vsession); - return -1; + return SPDK_POLLER_BUSY; } static void @@ -776,7 +777,7 @@ no_bdev_vdev_worker(void *arg) bvsession->io_channel = NULL; } - return -1; + return SPDK_POLLER_BUSY; } static struct spdk_vhost_blk_session * @@ -801,6 +802,32 @@ to_blk_dev(struct spdk_vhost_dev *vdev) return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev); } +static int +vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *ctx) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0) + SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid); + rte_vhost_slave_config_change(vsession->vid, false); +#else + SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n"); +#endif + + return 0; +} + +static void +blk_resize_cb(void *resize_ctx) +{ + struct spdk_vhost_blk_dev *bvdev = resize_ctx; + + spdk_vhost_lock(); + vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb, + NULL, NULL); + spdk_vhost_unlock(); +} + static void vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) { @@ -845,6 +872,29 @@ bdev_remove_cb(void *remove_ctx) spdk_vhost_unlock(); } +static void +bdev_event_cb(enum
spdk_bdev_event_type type, struct spdk_bdev *bdev, + void *event_ctx) +{ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Bdev event: type %d, name %s\n", + type, + bdev->name); + + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name); + bdev_remove_cb(event_ctx); + break; + case SPDK_BDEV_EVENT_RESIZE: + SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name); + blk_resize_cb(event_ctx); + break; + default: + SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); + break; + } +} + static void free_task_pool(struct spdk_vhost_blk_session *bvsession) { @@ -972,11 +1022,11 @@ destroy_session_poller_cb(void *arg) int i; if (vsession->task_cnt > 0) { - return -1; + return SPDK_POLLER_BUSY; } if (spdk_vhost_trylock() != 0) { - return -1; + return SPDK_POLLER_BUSY; } for (i = 0; i < vsession->max_queues; i++) { @@ -997,7 +1047,7 @@ destroy_session_poller_cb(void *arg) vhost_session_stop_done(vsession, 0); spdk_vhost_unlock(); - return -1; + return SPDK_POLLER_BUSY; } static int @@ -1234,7 +1284,7 @@ spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_ vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH); } - ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc); + ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc); if (ret != 0) { SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n", name, dev_name, ret); @@ -1280,16 +1330,16 @@ vhost_blk_destroy(struct spdk_vhost_dev *vdev) assert(bvdev != NULL); - /* if the bdev is removed, don't need call spdk_put_io_channel. */ - if (bvdev->bdev) { - spdk_put_io_channel(bvdev->dummy_io_channel); - } - rc = vhost_dev_unregister(&bvdev->vdev); if (rc != 0) { return rc; } + /* if the bdev is removed, don't need call spdk_put_io_channel. 
*/ + if (bvdev->bdev) { + spdk_put_io_channel(bvdev->dummy_io_channel); + } + if (bvdev->bdev_desc) { spdk_bdev_close(bvdev->bdev_desc); bvdev->bdev_desc = NULL; diff --git a/lib/vhost/vhost_nvme.c b/lib/vhost/vhost_nvme.c index 47b3ccb3634..10f53baf92b 100644 --- a/lib/vhost/vhost_nvme.c +++ b/lib/vhost/vhost_nvme.c @@ -583,11 +583,11 @@ nvme_worker(void *arg) int count = -1; if (spdk_unlikely(!nvme->num_sqs)) { - return -1; + return SPDK_POLLER_IDLE; } if (spdk_unlikely(!nvme->dataplane_started && !nvme->bar)) { - return -1; + return SPDK_POLLER_IDLE; } for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { @@ -598,7 +598,7 @@ nvme_worker(void *arg) } cq = vhost_nvme_get_cq_from_qid(nvme, sq->cqid); if (spdk_unlikely(!cq)) { - return -1; + return SPDK_POLLER_BUSY; } cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(sq->cqid, 1)); if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) && @@ -620,7 +620,7 @@ nvme_worker(void *arg) task = STAILQ_FIRST(&nvme->free_tasks); STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); } else { - return -1; + return SPDK_POLLER_BUSY; } task->cmd = sq->sq_cmd[sq->sq_head]; @@ -1113,7 +1113,7 @@ destroy_device_poller_cb(void *arg) /* FIXME wait for pending I/Os to complete */ if (spdk_vhost_trylock() != 0) { - return -1; + return SPDK_POLLER_BUSY; } for (i = 0; i < nvme->num_ns; i++) { @@ -1137,7 +1137,7 @@ destroy_device_poller_cb(void *arg) vhost_session_stop_done(nvme->vsession, 0); spdk_vhost_unlock(); - return -1; + return SPDK_POLLER_BUSY; } static int diff --git a/lib/vhost/vhost_scsi.c b/lib/vhost/vhost_scsi.c index e7634926eb3..35028c30f93 100644 --- a/lib/vhost/vhost_scsi.c +++ b/lib/vhost/vhost_scsi.c @@ -769,7 +769,7 @@ vdev_mgmt_worker(void *arg) process_vq(svsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]); vhost_vq_used_signal(vsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]); - return -1; + return SPDK_POLLER_BUSY; } static int @@ -785,7 +785,7 @@ vdev_worker(void *arg) 
vhost_session_used_signal(vsession); - return -1; + return SPDK_POLLER_BUSY; } static struct spdk_vhost_scsi_dev * @@ -993,7 +993,11 @@ spdk_vhost_scsi_dev_add_tgt(struct spdk_vhost_dev *vdev, int scsi_tgt_num, const char *bdev_names_list[1]; svdev = to_scsi_dev(vdev); - assert(svdev != NULL); + if (!svdev) { + SPDK_ERRLOG("Before adding a SCSI target, there should be a SCSI device.\n"); + return -EINVAL; + } + if (scsi_tgt_num < 0) { for (scsi_tgt_num = 0; scsi_tgt_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_tgt_num++) { if (svdev->scsi_dev_state[scsi_tgt_num].dev == NULL) { @@ -1118,7 +1122,11 @@ spdk_vhost_scsi_dev_remove_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_nu } svdev = to_scsi_dev(vdev); - assert(svdev != NULL); + if (!svdev) { + SPDK_ERRLOG("An invalid SCSI device that removing from a SCSI target.\n"); + return -EINVAL; + } + scsi_dev_state = &svdev->scsi_dev_state[scsi_tgt_num]; if (scsi_dev_state->status != VHOST_SCSI_DEV_PRESENT) { @@ -1364,11 +1372,11 @@ destroy_session_poller_cb(void *arg) uint32_t i; if (vsession->task_cnt > 0) { - return -1; + return SPDK_POLLER_BUSY; } if (spdk_vhost_trylock() != 0) { - return -1; + return SPDK_POLLER_BUSY; } for (i = 0; i < vsession->max_queues; i++) { @@ -1408,7 +1416,7 @@ destroy_session_poller_cb(void *arg) vhost_session_stop_done(vsession, 0); spdk_vhost_unlock(); - return -1; + return SPDK_POLLER_BUSY; } static int diff --git a/lib/virtio/virtio_pci.c b/lib/virtio/virtio_pci.c index 646f77c1aa2..2da0b31e992 100644 --- a/lib/virtio/virtio_pci.c +++ b/lib/virtio/virtio_pci.c @@ -224,10 +224,15 @@ static void modern_destruct_dev(struct virtio_dev *vdev) { struct virtio_hw *hw = vdev->ctx; - struct spdk_pci_device *pci_dev = hw->pci_dev; + struct spdk_pci_device *pci_dev; - free_virtio_hw(hw); - spdk_pci_device_detach(pci_dev); + if (hw != NULL) { + pci_dev = hw->pci_dev; + free_virtio_hw(hw); + if (pci_dev) { + spdk_pci_device_detach(pci_dev); + } + } } static uint8_t diff --git a/lib/virtio/virtio_user.c
b/lib/virtio/virtio_user.c index 4f4932db986..e45003352a0 100644 --- a/lib/virtio/virtio_user.c +++ b/lib/virtio/virtio_user.c @@ -470,7 +470,7 @@ virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq) vq->vq_ring_virt_mem = queue_mem; state.index = vq->vq_queue_index; - state.num = 0; + state.num = vq->vq_nentries; if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) { rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_ENABLE, &state); diff --git a/lib/vmd/vmd.c b/lib/vmd/vmd.c index 7fae7d42141..14d9558c20c 100644 --- a/lib/vmd/vmd.c +++ b/lib/vmd/vmd.c @@ -902,7 +902,6 @@ vmd_dev_init(struct vmd_pci_device *dev) dev->pci.unmap_bar = vmd_dev_unmap_bar; dev->pci.cfg_read = vmd_dev_cfg_read; dev->pci.cfg_write = vmd_dev_cfg_write; - dev->pci.detach = vmd_dev_detach; dev->hotplug_capable = false; if (dev->pcie_cap != NULL) { dev->cached_slot_control = dev->pcie_cap->slot_control; diff --git a/mk/nvme.libtest.mk b/mk/nvme.libtest.mk index 14eb7afb446..6fc8f9a4d79 100644 --- a/mk/nvme.libtest.mk +++ b/mk/nvme.libtest.mk @@ -38,9 +38,8 @@ include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk C_SRCS := $(APP:%=%.c) -# Unable to combine the FIO plugin and the VPP socket abstraction (license incompatibility) -SPDK_LIB_LIST = $(filter-out sock_vpp,$(SOCK_MODULES_LIST)) -SPDK_LIB_LIST += nvme thread util log sock vmd +SPDK_LIB_LIST = $(SOCK_MODULES_LIST) +SPDK_LIB_LIST += nvme thread util log sock vmd jsonrpc json rpc ifeq ($(CONFIG_RDMA),y) SPDK_LIB_LIST += rdma diff --git a/mk/spdk.common.mk b/mk/spdk.common.mk index 540e0868a36..d1647af7e71 100644 --- a/mk/spdk.common.mk +++ b/mk/spdk.common.mk @@ -144,15 +144,18 @@ LIBS += -L$(CONFIG_PMDK_DIR)/src/nondebug COMMON_CFLAGS += -I$(CONFIG_PMDK_DIR)/src/include endif -ifneq ($(CONFIG_VPP_DIR),) -LIBS += -L$(CONFIG_VPP_DIR)/lib64 -COMMON_CFLAGS += -I$(CONFIG_VPP_DIR)/include -endif - ifeq ($(CONFIG_RDMA),y) SYS_LIBS += -libverbs -lrdmacm endif +ifeq ($(CONFIG_URING),y) +SYS_LIBS += -luring +ifneq ($(strip 
$(CONFIG_URING_PATH)),) +CFLAGS += -I$(CONFIG_URING_PATH) +LDFLAGS += -L$(CONFIG_URING_PATH) +endif +endif + IPSEC_MB_DIR=$(SPDK_ROOT_DIR)/intel-ipsec-mb ISAL_DIR=$(SPDK_ROOT_DIR)/isa-l @@ -252,18 +255,21 @@ COMPILE_CXX=\ $(CXX) -o $@ $(DEPFLAGS) $(CXXFLAGS) -c $< && \ mv -f $*.d.tmp $*.d && touch -c $@ +ENV_LDFLAGS = $(if $(SPDK_NO_LINK_ENV),,$(ENV_LINKER_ARGS)) + # Link $(OBJS) and $(LIBS) into $@ (app) LINK_C=\ $(Q)echo " LINK $(notdir $@)"; \ - $(CC) -o $@ $(CPPFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) $(ENV_LINKER_ARGS) $(SYS_LIBS) + $(CC) -o $@ $(CPPFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) $(ENV_LDFLAGS) $(SYS_LIBS) LINK_CXX=\ $(Q)echo " LINK $(notdir $@)"; \ - $(CXX) -o $@ $(CPPFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) $(ENV_LINKER_ARGS) $(SYS_LIBS) + $(CXX) -o $@ $(CPPFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) $(ENV_LDFLAGS) $(SYS_LIBS) # Provide function to ease build of a shared lib define spdk_build_realname_shared_lib $(CC) -o $@ -shared $(CPPFLAGS) $(LDFLAGS) \ + -Wl,-rpath=$(DESTDIR)/$(libdir) \ -Wl,--soname,$(notdir $@) \ -Wl,--whole-archive $(1) -Wl,--no-whole-archive \ -Wl,--version-script=$(2) \ diff --git a/mk/spdk.lib_deps.mk b/mk/spdk.lib_deps.mk index 1b5f75040ef..c2ff2950d8f 100644 --- a/mk/spdk.lib_deps.mk +++ b/mk/spdk.lib_deps.mk @@ -46,7 +46,7 @@ DEPDIRS-rte_vhost := DEPDIRS-ioat := log DEPDIRS-idxd := log util -DEPDIRS-sock := log +DEPDIRS-sock := log $(JSON_LIBS) DEPDIRS-util := log DEPDIRS-vmd := log @@ -108,9 +108,12 @@ DEPDIRS-blob_bdev := log thread bdev # module/blobfs DEPDIRS-blobfs_bdev := $(BDEV_DEPS_THREAD) blob_bdev blobfs +ifeq ($(CONFIG_FUSE),y) +DEPDIRS-blobfs_bdev += event +endif # module/accel -DEPDIRS-accel_ioat := log ioat conf thread $(JSON_LIBS) accel +DEPDIRS-accel_ioat := log ioat conf thread $(JSON_LIBS) accel util DEPDIRS-accel_idxd := log idxd thread $(JSON_LIBS) accel # module/env_dpdk @@ -119,7 +122,6 @@ DEPDIRS-env_dpdk_rpc := log $(JSON_LIBS) # module/sock DEPDIRS-sock_posix := log sock util DEPDIRS-sock_uring := log sock util 
-DEPDIRS-sock_vpp := log sock util thread # module/bdev DEPDIRS-bdev_gpt := bdev conf json log thread util @@ -148,6 +150,7 @@ DEPDIRS-bdev_passthru := $(BDEV_DEPS_CONF_THREAD) DEPDIRS-bdev_pmem := $(BDEV_DEPS_CONF_THREAD) DEPDIRS-bdev_raid := $(BDEV_DEPS_CONF_THREAD) DEPDIRS-bdev_rbd := $(BDEV_DEPS_CONF_THREAD) +DEPDIRS-bdev_uring := $(BDEV_DEPS_CONF_THREAD) DEPDIRS-bdev_virtio := $(BDEV_DEPS_CONF_THREAD) virtio # module/event @@ -162,11 +165,12 @@ DEPDIRS-event_accel := accel event DEPDIRS-event_net := sock net event DEPDIRS-event_vmd := vmd conf $(JSON_LIBS) event log thread -DEPDIRS-event_bdev := bdev event event_accel event_vmd +DEPDIRS-event_bdev := bdev event event_accel event_vmd event_sock DEPDIRS-event_nbd := event nbd event_bdev -DEPDIRS-event_nvmf := $(BDEV_DEPS_CONF_THREAD) event nvme nvmf event_bdev +DEPDIRS-event_nvmf := $(BDEV_DEPS_CONF_THREAD) event nvme nvmf event_bdev event_sock DEPDIRS-event_scsi := event scsi event_bdev -DEPDIRS-event_iscsi := event iscsi event_scsi +DEPDIRS-event_iscsi := event iscsi event_scsi event_sock DEPDIRS-event_vhost := event vhost event_scsi +DEPDIRS-event_sock := event sock diff --git a/mk/spdk.modules.mk b/mk/spdk.modules.mk index c4e3737430b..894c27031d9 100644 --- a/mk/spdk.modules.mk +++ b/mk/spdk.modules.mk @@ -99,19 +99,11 @@ SOCK_MODULES_LIST += sock_uring endif endif -ifeq ($(CONFIG_VPP),y) -SYS_LIBS += -Wl,--whole-archive -ifneq ($(CONFIG_VPP_DIR),) -SYS_LIBS += -L$(CONFIG_VPP_DIR)/lib -endif -SYS_LIBS += -lvppinfra -lsvm -lvlibmemoryclient -SYS_LIBS += -Wl,--no-whole-archive -SOCK_MODULES_LIST += sock_vpp -endif - ACCEL_MODULES_LIST = accel_ioat ioat ifeq ($(CONFIG_IDXD),y) ACCEL_MODULES_LIST += accel_idxd idxd endif +EVENT_BDEV_SUBSYSTEM = event_bdev event_accel event_vmd event_sock + ALL_MODULES_LIST = $(BLOCKDEV_MODULES_LIST) $(ACCEL_MODULES_LIST) $(SOCK_MODULES_LIST) diff --git a/mk/spdk.nvmecli.mk b/mk/spdk.nvmecli.mk index 0b6f416efea..eb04a71a395 100644 --- a/mk/spdk.nvmecli.mk +++ 
b/mk/spdk.nvmecli.mk @@ -37,7 +37,7 @@ SPDK_LIB_DIR ?= $(SPDK_ROOT_DIR)/build/lib include $(SPDK_ROOT_DIR)/mk/config.mk DPDK_LIB_DIR ?= $(CONFIG_DPDK_DIR)/lib -DPDK_LIB_LIST = -lrte_eal -lrte_mempool -lrte_ring -lrte_pci -lrte_bus_pci +DPDK_LIB_LIST = -lrte_eal -lrte_mempool -lrte_ring -lrte_pci -lrte_bus_pci -lrte_mbuf ifneq (, $(wildcard $(DPDK_LIB_DIR)/librte_kvargs.*)) DPDK_LIB_LIST += -lrte_kvargs @@ -47,12 +47,28 @@ ifneq (, $(wildcard $(DPDK_LIB_DIR)/librte_power.*)) DPDK_LIB_LIST += -lrte_power endif -NVMECLI_SPDK_LIBS = -lspdk_log -lspdk_sock -lspdk_nvme -lspdk_env_dpdk -lspdk_util +ifneq (, $(wildcard $(DPDK_LIB_DIR)/librte_telemetry.*)) +DPDK_LIB_LIST += -lrte_telemetry +endif + +NVMECLI_SPDK_LIBS = -lspdk_log -lspdk_sock -lspdk_nvme -lspdk_env_dpdk -lspdk_util -lspdk_jsonrpc -lspdk_json -lspdk_rpc ifeq ($(CONFIG_RDMA),y) NVMECLI_SPDK_LIBS += -lspdk_rdma endif +ifeq ($(CONFIG_OCF),y) +NVMECLI_SPDK_LIBS += -lspdk_ocfenv +endif + +ifeq ($(CONFIG_VHOST),y) +ifneq ($(CONFIG_VHOST_INTERNAL_LIB),y) +DPDK_LIB_LIST += -lrte_vhost -lrte_net -lrte_cryptodev -lrte_hash +else +NVMECLI_SPDK_LIBS += -lrte_vhost +endif +endif + override CFLAGS += -I$(SPDK_ROOT_DIR)/include override LDFLAGS += \ -Wl,--whole-archive \ @@ -86,3 +102,7 @@ ifeq ($(CONFIG_COVERAGE), y) override CFLAGS += -fprofile-arcs -ftest-coverage override LDFLAGS += -fprofile-arcs -ftest-coverage endif + +ifeq ($(CONFIG_ISCSI_INITIATOR),y) +override LDFLAGS += -L/usr/lib64/iscsi -liscsi +endif diff --git a/module/accel/idxd/accel_engine_idxd.c b/module/accel/idxd/accel_engine_idxd.c index 6bfee2c05e9..bb590ac0e63 100644 --- a/module/accel/idxd/accel_engine_idxd.c +++ b/module/accel/idxd/accel_engine_idxd.c @@ -47,15 +47,9 @@ #include "spdk/util.h" #include "spdk/json.h" -/* Undefine this to require an RPC to enable IDXD. 
*/ -#undef DEVELOPER_DEBUG_MODE +#define ALIGN_4K 0x1000 -#ifdef DEVELOPER_DEBUG_MODE -static bool g_idxd_enable = true; -#else static bool g_idxd_enable = false; -#endif - uint32_t g_config_number; enum channel_state { @@ -94,6 +88,7 @@ struct idxd_op { uint64_t fill_pattern; uint32_t op_code; uint64_t nbytes; + struct idxd_batch *batch; TAILQ_ENTRY(idxd_op) link; }; @@ -110,7 +105,7 @@ struct idxd_task { spdk_accel_completion_cb cb; }; -pthread_mutex_t g_num_channels_lock = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t g_configuration_lock = PTHREAD_MUTEX_INITIALIZER; static struct spdk_io_channel *idxd_get_io_channel(void); @@ -121,6 +116,7 @@ idxd_select_device(void) * We allow channels to share underlying devices, * selection is round-robin based. */ + g_next_dev = TAILQ_NEXT(g_next_dev, tailq); if (g_next_dev == NULL) { g_next_dev = TAILQ_FIRST(&g_idxd_devices); @@ -144,7 +140,6 @@ idxd_poll(void *arg) while (!TAILQ_EMPTY(&chan->queued_ops)) { op = TAILQ_FIRST(&chan->queued_ops); - TAILQ_REMOVE(&chan->queued_ops, op, link); switch (op->op_code) { case IDXD_OPCODE_MEMMOVE: @@ -167,16 +162,19 @@ idxd_poll(void *arg) rc = spdk_idxd_submit_crc32c(op->chan, op->dst, op->src, op->seed, op->nbytes, op->cb_fn, op->cb_arg); break; + case IDXD_OPCODE_BATCH: + rc = spdk_idxd_batch_submit(op->chan, op->batch, op->cb_fn, op->cb_arg); + break; default: /* Should never get here */ assert(false); break; } if (rc == 0) { + TAILQ_REMOVE(&chan->queued_ops, op, link); free(op); } else { /* Busy, resubmit to try again later */ - TAILQ_INSERT_HEAD(&chan->queued_ops, op, link); break; } } @@ -193,13 +191,13 @@ accel_engine_idxd_get_ctx_size(void) static void idxd_done(void *cb_arg, int status) { - struct spdk_accel_task *accel_req; + struct spdk_accel_task *accel_task; struct idxd_task *idxd_task = cb_arg; - accel_req = SPDK_CONTAINEROF(idxd_task, struct spdk_accel_task, - offload_ctx); + accel_task = SPDK_CONTAINEROF(idxd_task, struct spdk_accel_task, + offload_ctx); - 
idxd_task->cb(accel_req, status); + idxd_task->cb(accel_task, status); } static struct idxd_op * @@ -221,14 +219,14 @@ _prep_queue_command(struct idxd_io_channel *chan, spdk_accel_completion_cb cb_fn } static int -idxd_submit_copy(void *cb_arg, struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, - spdk_accel_completion_cb cb) +idxd_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) { struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); int rc = 0; - idxd_task->cb = cb; + idxd_task->cb = cb_fn; if (chan->state == IDXD_CHANNEL_ACTIVE) { rc = spdk_idxd_submit_copy(chan->chan, dst, src, nbytes, idxd_done, idxd_task); @@ -261,15 +259,14 @@ idxd_submit_copy(void *cb_arg, struct spdk_io_channel *ch, void *dst, void *src, } static int -idxd_submit_dualcast(void *cb_arg, struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, - uint64_t nbytes, - spdk_accel_completion_cb cb) +idxd_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); int rc = 0; - idxd_task->cb = cb; + idxd_task->cb = cb_fn; if (chan->state == IDXD_CHANNEL_ACTIVE) { rc = spdk_idxd_submit_dualcast(chan->chan, dst1, dst2, src, nbytes, idxd_done, idxd_task); @@ -303,15 +300,14 @@ idxd_submit_dualcast(void *cb_arg, struct spdk_io_channel *ch, void *dst1, void } static int -idxd_submit_compare(void *cb_arg, struct spdk_io_channel *ch, void *src1, void *src2, - uint64_t nbytes, - spdk_accel_completion_cb cb) +idxd_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; struct idxd_io_channel *chan = 
spdk_io_channel_get_ctx(ch); int rc = 0; - idxd_task->cb = cb; + idxd_task->cb = cb_fn; if (chan->state == IDXD_CHANNEL_ACTIVE) { rc = spdk_idxd_submit_compare(chan->chan, src1, src2, nbytes, idxd_done, idxd_task); @@ -344,15 +340,15 @@ idxd_submit_compare(void *cb_arg, struct spdk_io_channel *ch, void *src1, void * } static int -idxd_submit_fill(void *cb_arg, struct spdk_io_channel *ch, void *dst, uint8_t fill, - uint64_t nbytes, spdk_accel_completion_cb cb) +idxd_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); int rc = 0; uint64_t fill_pattern; - idxd_task->cb = cb; + idxd_task->cb = cb_fn; memset(&fill_pattern, fill, sizeof(uint64_t)); if (chan->state == IDXD_CHANNEL_ACTIVE) { @@ -386,14 +382,14 @@ idxd_submit_fill(void *cb_arg, struct spdk_io_channel *ch, void *dst, uint8_t fi } static int -idxd_submit_crc32c(void *cb_arg, struct spdk_io_channel *ch, uint32_t *dst, void *src, - uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb) +idxd_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); int rc = 0; - idxd_task->cb = cb; + idxd_task->cb = cb_fn; if (chan->state == IDXD_CHANNEL_ACTIVE) { rc = spdk_idxd_submit_crc32c(chan->chan, dst, src, seed, nbytes, idxd_done, idxd_task); @@ -430,12 +426,157 @@ static uint64_t idxd_get_capabilities(void) { return ACCEL_COPY | ACCEL_FILL | ACCEL_CRC32C | ACCEL_COMPARE | - ACCEL_DUALCAST; + ACCEL_DUALCAST | ACCEL_BATCH; +} + +static uint32_t +idxd_batch_get_max(void) +{ + return spdk_idxd_batch_get_max(); +} + +static struct spdk_accel_batch * +idxd_batch_start(struct spdk_io_channel *ch) +{ + struct 
idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); + + return (struct spdk_accel_batch *)spdk_idxd_batch_create(chan->chan); +} + +static int +idxd_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch) +{ + struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); + struct idxd_batch *batch = (struct idxd_batch *)_batch; + + return spdk_idxd_batch_cancel(chan->chan, batch); +} + +static int +idxd_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; + struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); + struct idxd_batch *batch = (struct idxd_batch *)_batch; + int rc = 0; + + idxd_task->cb = cb_fn; + + if (chan->state == IDXD_CHANNEL_ACTIVE) { + rc = spdk_idxd_batch_submit(chan->chan, batch, idxd_done, idxd_task); + } + + if (chan->state == IDXD_CHANNEL_PAUSED || rc == -EBUSY) { + struct idxd_op *op_to_queue; + + /* Common prep. */ + op_to_queue = _prep_queue_command(chan, idxd_done, idxd_task); + if (op_to_queue == NULL) { + return -ENOMEM; + } + + /* Command specific. */ + op_to_queue->batch = batch; + op_to_queue->op_code = IDXD_OPCODE_BATCH; + + /* Queue the operation.
*/ + TAILQ_INSERT_TAIL(&chan->queued_ops, op_to_queue, link); + return 0; + + } else if (chan->state == IDXD_CHANNEL_ERROR) { + return -EINVAL; + } + + return rc; +} + +static int +idxd_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch, + void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; + struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); + struct idxd_batch *batch = (struct idxd_batch *)_batch; + + idxd_task->cb = cb_fn; + + return spdk_idxd_batch_prep_copy(chan->chan, batch, dst, src, nbytes, + idxd_done, idxd_task); +} + +static int +idxd_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch, + void *dst, uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; + struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); + uint64_t fill_pattern; + struct idxd_batch *batch = (struct idxd_batch *)_batch; + + idxd_task->cb = cb_fn; + memset(&fill_pattern, fill, sizeof(uint64_t)); + + return spdk_idxd_batch_prep_fill(chan->chan, batch, dst, fill_pattern, nbytes, idxd_done, + idxd_task); +} + +static int +idxd_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch, + void *dst1, void *dst2, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; + struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); + struct idxd_batch *batch = (struct idxd_batch *)_batch; + + idxd_task->cb = cb_fn; + + return spdk_idxd_batch_prep_dualcast(chan->chan, batch, dst1, dst2, src, nbytes, idxd_done, + idxd_task); +} + +static int +idxd_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct idxd_task 
*idxd_task = (struct idxd_task *)cb_arg; + struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); + struct idxd_batch *batch = (struct idxd_batch *)_batch; + + idxd_task->cb = cb_fn; + + return spdk_idxd_batch_prep_crc32c(chan->chan, batch, dst, src, seed, nbytes, idxd_done, + idxd_task); +} + +static int +idxd_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch, + void *src1, void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct idxd_task *idxd_task = (struct idxd_task *)cb_arg; + struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch); + struct idxd_batch *batch = (struct idxd_batch *)_batch; + + idxd_task->cb = cb_fn; + + return spdk_idxd_batch_prep_compare(chan->chan, batch, src1, src2, nbytes, idxd_done, + idxd_task); } static struct spdk_accel_engine idxd_accel_engine = { .get_capabilities = idxd_get_capabilities, .copy = idxd_submit_copy, + .batch_get_max = idxd_batch_get_max, + .batch_create = idxd_batch_start, + .batch_cancel = idxd_batch_cancel, + .batch_prep_copy = idxd_batch_prep_copy, + .batch_prep_fill = idxd_batch_prep_fill, + .batch_prep_dualcast = idxd_batch_prep_dualcast, + .batch_prep_crc32c = idxd_batch_prep_crc32c, + .batch_prep_compare = idxd_batch_prep_compare, + .batch_submit = idxd_batch_submit, .dualcast = idxd_submit_dualcast, .compare = idxd_submit_compare, .fill = idxd_submit_fill, @@ -458,9 +599,9 @@ _config_max_desc(struct spdk_io_channel_iter *i) ch = spdk_io_channel_iter_get_channel(i); chan = spdk_io_channel_get_ctx(ch); - pthread_mutex_lock(&g_num_channels_lock); + pthread_mutex_lock(&g_configuration_lock); rc = spdk_idxd_reconfigure_chan(chan->chan, chan->dev->num_channels); - pthread_mutex_unlock(&g_num_channels_lock); + pthread_mutex_unlock(&g_configuration_lock); if (rc == 0) { chan->state = IDXD_CHANNEL_ACTIVE; } else { @@ -489,8 +630,7 @@ _pause_chan(struct spdk_io_channel_iter *i) static void _pause_chan_done(struct spdk_io_channel_iter *i, int status) { - 
spdk_for_each_channel(&idxd_accel_engine, _config_max_desc, NULL, - NULL); + spdk_for_each_channel(&idxd_accel_engine, _config_max_desc, NULL, NULL); } static int @@ -521,18 +661,19 @@ idxd_create_cb(void *io_device, void *ctx_buf) * channels. This enables dynamic load balancing for HW * flow control. */ + pthread_mutex_lock(&g_configuration_lock); rc = spdk_idxd_configure_chan(chan->chan); if (rc) { SPDK_ERRLOG("Failed to configure new channel rc = %d\n", rc); chan->state = IDXD_CHANNEL_ERROR; spdk_poller_unregister(&chan->poller); + pthread_mutex_unlock(&g_configuration_lock); return rc; } chan->state = IDXD_CHANNEL_PAUSED; - pthread_mutex_lock(&g_num_channels_lock); chan->dev->num_channels++; - pthread_mutex_unlock(&g_num_channels_lock); + pthread_mutex_unlock(&g_configuration_lock); /* * Pause all channels so that we can set proper flow control @@ -549,8 +690,7 @@ static void _pause_chan_destroy_done(struct spdk_io_channel_iter *i, int status) { /* Rebalance the rings with the smaller number of remaining channels. 
*/ - spdk_for_each_channel(&idxd_accel_engine, _config_max_desc, NULL, - NULL); + spdk_for_each_channel(&idxd_accel_engine, _config_max_desc, NULL, NULL); } static void @@ -558,12 +698,12 @@ idxd_destroy_cb(void *io_device, void *ctx_buf) { struct idxd_io_channel *chan = ctx_buf; - pthread_mutex_lock(&g_num_channels_lock); + pthread_mutex_lock(&g_configuration_lock); assert(chan->dev->num_channels > 0); chan->dev->num_channels--; - pthread_mutex_unlock(&g_num_channels_lock); - spdk_idxd_reconfigure_chan(chan->chan, 0); + pthread_mutex_unlock(&g_configuration_lock); + spdk_poller_unregister(&chan->poller); spdk_idxd_put_channel(chan->chan); @@ -605,10 +745,6 @@ probe_cb(void *cb_ctx, struct spdk_pci_device *pci_dev) return false; } -#ifdef DEVELOPER_DEBUG_MODE - spdk_idxd_set_config(0); -#endif - return true; } diff --git a/module/accel/ioat/accel_engine_ioat.c b/module/accel/ioat/accel_engine_ioat.c index a485878b7bf..bd37990cc08 100644 --- a/module/accel/ioat/accel_engine_ioat.c +++ b/module/accel/ioat/accel_engine_ioat.c @@ -43,7 +43,36 @@ #include "spdk/event.h" #include "spdk/thread.h" #include "spdk/ioat.h" +#include "spdk/crc32.h" +#define ALIGN_4K 0x1000 + +enum ioat_accel_opcode { + IOAT_ACCEL_OPCODE_MEMMOVE = 0, + IOAT_ACCEL_OPCODE_MEMFILL = 1, + IOAT_ACCEL_OPCODE_COMPARE = 2, + IOAT_ACCEL_OPCODE_CRC32C = 3, + IOAT_ACCEL_OPCODE_DUALCAST = 4, +}; + +struct ioat_accel_op { + struct ioat_io_channel *ioat_ch; + void *cb_arg; + spdk_accel_completion_cb cb_fn; + void *src; + union { + void *dst; + void *src2; + }; + void *dst2; + uint32_t seed; + uint64_t fill_pattern; + enum ioat_accel_opcode op_code; + uint64_t nbytes; + TAILQ_ENTRY(ioat_accel_op) link; +}; + +static int g_batch_size; static bool g_ioat_enable = false; static bool g_ioat_initialized = false; @@ -71,11 +100,13 @@ static pthread_mutex_t g_ioat_mutex = PTHREAD_MUTEX_INITIALIZER; static TAILQ_HEAD(, pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices); - struct ioat_io_channel { - 
struct spdk_ioat_chan *ioat_ch; - struct ioat_device *ioat_dev; - struct spdk_poller *poller; + struct spdk_ioat_chan *ioat_ch; + struct ioat_device *ioat_dev; + struct spdk_poller *poller; + TAILQ_HEAD(, ioat_accel_op) op_pool; + TAILQ_HEAD(, ioat_accel_op) sw_batch; /* for operations not hw accelerated */ + bool hw_batch; /* for operations that are hw accelerated */ }; static int @@ -140,33 +171,33 @@ SPDK_ACCEL_MODULE_REGISTER(accel_engine_ioat_init, accel_engine_ioat_exit, static void ioat_done(void *cb_arg) { - struct spdk_accel_task *accel_req; + struct spdk_accel_task *accel_task; struct ioat_task *ioat_task = cb_arg; - accel_req = (struct spdk_accel_task *) - ((uintptr_t)ioat_task - - offsetof(struct spdk_accel_task, offload_ctx)); + accel_task = (struct spdk_accel_task *) + ((uintptr_t)ioat_task - + offsetof(struct spdk_accel_task, offload_ctx)); - ioat_task->cb(accel_req, 0); + ioat_task->cb(accel_task, 0); } static int -ioat_submit_copy(void *cb_arg, struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, - spdk_accel_completion_cb cb) +ioat_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) { struct ioat_task *ioat_task = (struct ioat_task *)cb_arg; struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); assert(ioat_ch->ioat_ch != NULL); - ioat_task->cb = cb; + ioat_task->cb = cb_fn; return spdk_ioat_submit_copy(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, src, nbytes); } static int -ioat_submit_fill(void *cb_arg, struct spdk_io_channel *ch, void *dst, uint8_t fill, - uint64_t nbytes, spdk_accel_completion_cb cb) +ioat_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) { struct ioat_task *ioat_task = (struct ioat_task *)cb_arg; struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); @@ -174,7 +205,7 @@ ioat_submit_fill(void *cb_arg, struct spdk_io_channel *ch, void *dst, 
uint8_t fi assert(ioat_ch->ioat_ch != NULL); - ioat_task->cb = cb; + ioat_task->cb = cb_fn; return spdk_ioat_submit_fill(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, fill64, nbytes); } @@ -184,27 +215,271 @@ ioat_poll(void *arg) { struct spdk_ioat_chan *chan = arg; - spdk_ioat_process_events(chan); - - return -1; + return spdk_ioat_process_events(chan) != 0 ? SPDK_POLLER_BUSY : + SPDK_POLLER_IDLE; } static struct spdk_io_channel *ioat_get_io_channel(void); /* - * The IOAT engine has more capabilities than this but these are - * the only ones we expose via the accel engine. + * The IOAT engine only supports these capabilities as hardware + * accelerated. The accel fw will handle unsupported functions + * by calling the software implementations of the functions. */ static uint64_t ioat_get_capabilities(void) { - return ACCEL_COPY | ACCEL_FILL; + return ACCEL_COPY | ACCEL_FILL | ACCEL_BATCH; +} + +/* The IOAT batch functions exposed by the accel fw do not match up 1:1 + * with the functions in the IOAT library. The IOAT library directly only + * supports construction of accelerated functions via the IOAT native + * interface. The accel_fw batch capabilities are implemented here in the + * plug-in and rely on either the IOAT library for accelerated commands + * or software functions for non-accelerated. 
+ */ +static uint32_t +ioat_batch_get_max(void) +{ + return g_batch_size; +} + +static struct spdk_accel_batch * +ioat_batch_create(struct spdk_io_channel *ch) +{ + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + + if (!TAILQ_EMPTY(&ioat_ch->sw_batch) || (ioat_ch->hw_batch == true)) { + SPDK_ERRLOG("IOAT accel engine only supports one batch at a time.\n"); + return NULL; + } + + return (struct spdk_accel_batch *)&ioat_ch->hw_batch; +} + +static struct ioat_accel_op * +_prep_op(struct ioat_io_channel *ioat_ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct ioat_accel_op *op; + + if ((struct spdk_accel_batch *)&ioat_ch->hw_batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return NULL; + } + + if (!TAILQ_EMPTY(&ioat_ch->op_pool)) { + op = TAILQ_FIRST(&ioat_ch->op_pool); + TAILQ_REMOVE(&ioat_ch->op_pool, op, link); + } else { + SPDK_ERRLOG("Ran out of operations for batch\n"); + return NULL; + } + + op->cb_arg = cb_arg; + op->cb_fn = cb_fn; + op->ioat_ch = ioat_ch; + + return op; +} + +static int +ioat_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + struct ioat_task *ioat_task = (struct ioat_task *)cb_arg; + + ioat_task->cb = cb_fn; + ioat_ch->hw_batch = true; + + /* Call the IOAT library prep function. 
*/ + return spdk_ioat_build_copy(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, src, nbytes); +} + +static int +ioat_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + struct ioat_task *ioat_task = (struct ioat_task *)cb_arg; + uint64_t fill_pattern; + + ioat_task->cb = cb_fn; + ioat_ch->hw_batch = true; + memset(&fill_pattern, fill, sizeof(uint64_t)); + + /* Call the IOAT library prep function. */ + return spdk_ioat_build_fill(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, fill_pattern, nbytes); +} + +static int +ioat_batch_prep_dualcast(struct spdk_io_channel *ch, + struct spdk_accel_batch *batch, void *dst1, void *dst2, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct ioat_accel_op *op; + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + op = _prep_op(ioat_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->src = src; + op->dst = dst1; + op->dst2 = dst2; + op->nbytes = nbytes; + op->op_code = IOAT_ACCEL_OPCODE_DUALCAST; + TAILQ_INSERT_TAIL(&ioat_ch->sw_batch, op, link); + + return 0; +} + +static int +ioat_batch_prep_compare(struct spdk_io_channel *ch, + struct spdk_accel_batch *batch, void *src1, + void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct ioat_accel_op *op; + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(ioat_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. 
*/ + op->src = src1; + op->src2 = src2; + op->nbytes = nbytes; + op->op_code = IOAT_ACCEL_OPCODE_COMPARE; + TAILQ_INSERT_TAIL(&ioat_ch->sw_batch, op, link); + + return 0; +} + +static int +ioat_batch_prep_crc32c(struct spdk_io_channel *ch, + struct spdk_accel_batch *batch, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct ioat_accel_op *op; + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(ioat_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->dst = (void *)dst; + op->src = src; + op->seed = seed; + op->nbytes = nbytes; + op->op_code = IOAT_ACCEL_OPCODE_CRC32C; + TAILQ_INSERT_TAIL(&ioat_ch->sw_batch, op, link); + + return 0; +} + +static int +ioat_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch) +{ + struct ioat_accel_op *op; + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + + if ((struct spdk_accel_batch *)&ioat_ch->hw_batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return -EINVAL; + } + + /* Flush the batched HW items, there's no way to cancel these without resetting. */ + spdk_ioat_flush(ioat_ch->ioat_ch); + ioat_ch->hw_batch = false; + + /* Return batched software items to the pool. */ + while ((op = TAILQ_FIRST(&ioat_ch->sw_batch))) { + TAILQ_REMOVE(&ioat_ch->sw_batch, op, link); + TAILQ_INSERT_TAIL(&ioat_ch->op_pool, op, link); + } + + return 0; +} + +static int +ioat_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct ioat_accel_op *op; + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_task; + int batch_status = 0, cmd_status = 0; + + if ((struct spdk_accel_batch *)&ioat_ch->hw_batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return -EINVAL; + } + + /* Flush the batched HW items first. 
*/ + spdk_ioat_flush(ioat_ch->ioat_ch); + ioat_ch->hw_batch = false; + + /* Complete the batched software items. */ + while ((op = TAILQ_FIRST(&ioat_ch->sw_batch))) { + TAILQ_REMOVE(&ioat_ch->sw_batch, op, link); + accel_task = (struct spdk_accel_task *)((uintptr_t)op->cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + + switch (op->op_code) { + case IOAT_ACCEL_OPCODE_DUALCAST: + memcpy(op->dst, op->src, op->nbytes); + memcpy(op->dst2, op->src, op->nbytes); + break; + case IOAT_ACCEL_OPCODE_COMPARE: + cmd_status = memcmp(op->src, op->src2, op->nbytes); + break; + case IOAT_ACCEL_OPCODE_CRC32C: + *(uint32_t *)op->dst = spdk_crc32c_update(op->src, op->nbytes, ~op->seed); + break; + default: + assert(false); + break; + } + + batch_status |= cmd_status; + op->cb_fn(accel_task, cmd_status); + TAILQ_INSERT_TAIL(&ioat_ch->op_pool, op, link); + } + + /* Now complete the batch request itself. */ + accel_task = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_task, batch_status); + + return 0; } static struct spdk_accel_engine ioat_accel_engine = { .get_capabilities = ioat_get_capabilities, .copy = ioat_submit_copy, .fill = ioat_submit_fill, + .batch_get_max = ioat_batch_get_max, + .batch_create = ioat_batch_create, + .batch_cancel = ioat_batch_cancel, + .batch_prep_copy = ioat_batch_prep_copy, + .batch_prep_dualcast = ioat_batch_prep_dualcast, + .batch_prep_compare = ioat_batch_prep_compare, + .batch_prep_fill = ioat_batch_prep_fill, + .batch_prep_crc32c = ioat_batch_prep_crc32c, + .batch_submit = ioat_batch_submit, .get_io_channel = ioat_get_io_channel, }; @@ -213,12 +488,32 @@ ioat_create_cb(void *io_device, void *ctx_buf) { struct ioat_io_channel *ch = ctx_buf; struct ioat_device *ioat_dev; + struct ioat_accel_op *op; + int i; ioat_dev = ioat_allocate_device(); if (ioat_dev == NULL) { return -1; } + TAILQ_INIT(&ch->sw_batch); + ch->hw_batch = false; + TAILQ_INIT(&ch->op_pool); + + g_batch_size = 
spdk_ioat_get_max_descriptors(ioat_dev->ioat); + for (i = 0 ; i < g_batch_size ; i++) { + op = calloc(1, sizeof(struct ioat_accel_op)); + if (op == NULL) { + SPDK_ERRLOG("Failed to allocate operation for batch.\n"); + while ((op = TAILQ_FIRST(&ch->op_pool))) { + TAILQ_REMOVE(&ch->op_pool, op, link); + free(op); + } + return -ENOMEM; + } + TAILQ_INSERT_TAIL(&ch->op_pool, op, link); + } + ch->ioat_dev = ioat_dev; ch->ioat_ch = ioat_dev->ioat; ch->poller = SPDK_POLLER_REGISTER(ioat_poll, ch->ioat_ch, 0); @@ -229,6 +524,12 @@ static void ioat_destroy_cb(void *io_device, void *ctx_buf) { struct ioat_io_channel *ch = ctx_buf; + struct ioat_accel_op *op; + + while ((op = TAILQ_FIRST(&ch->op_pool))) { + TAILQ_REMOVE(&ch->op_pool, op, link); + free(op); + } ioat_free_device(ch->ioat_dev); spdk_poller_unregister(&ch->poller); diff --git a/module/bdev/aio/bdev_aio.c b/module/bdev/aio/bdev_aio.c index 5eb63f69e0a..46cf5f69b3a 100644 --- a/module/bdev/aio/bdev_aio.c +++ b/module/bdev/aio/bdev_aio.c @@ -53,12 +53,14 @@ struct bdev_aio_io_channel { uint64_t io_inflight; + io_context_t io_ctx; struct bdev_aio_group_channel *group_ch; + TAILQ_ENTRY(bdev_aio_io_channel) link; }; struct bdev_aio_group_channel { struct spdk_poller *poller; - io_context_t io_ctx; + TAILQ_HEAD(, bdev_aio_io_channel) io_ch_head; }; struct bdev_aio_task { @@ -178,7 +180,7 @@ bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch, SPDK_DEBUGLOG(SPDK_LOG_AIO, "read %d iovs size %lu to off: %#lx\n", iovcnt, nbytes, offset); - rc = io_submit(aio_ch->group_ch->io_ctx, 1, &iocb); + rc = io_submit(aio_ch->io_ctx, 1, &iocb); if (rc < 0) { if (rc == -EAGAIN) { spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM); @@ -209,7 +211,7 @@ bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch, SPDK_DEBUGLOG(SPDK_LOG_AIO, "write %d iovs size %lu from off: %#lx\n", iovcnt, len, offset); - rc = io_submit(aio_ch->group_ch->io_ctx, 1, &iocb); + rc = 
io_submit(aio_ch->io_ctx, 1, &iocb); if (rc < 0) { if (rc == -EAGAIN) { spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM); @@ -312,18 +314,17 @@ bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *u } static int -bdev_aio_group_poll(void *arg) +bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch) { - struct bdev_aio_group_channel *group_ch = arg; int nr, i = 0; enum spdk_bdev_io_status status; struct bdev_aio_task *aio_task; struct io_event events[SPDK_AIO_QUEUE_DEPTH]; - nr = bdev_user_io_getevents(group_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events); + nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events); if (nr < 0) { - return -1; + return 0; } for (i = 0; i < nr; i++) { @@ -341,6 +342,20 @@ bdev_aio_group_poll(void *arg) return nr; } +static int +bdev_aio_group_poll(void *arg) +{ + struct bdev_aio_group_channel *group_ch = arg; + struct bdev_aio_io_channel *io_ch; + int nr = 0; + + TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) { + nr += bdev_aio_io_channel_poll(io_ch); + } + + return nr > 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + static void _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i) { @@ -384,7 +399,7 @@ bdev_aio_reset_retry_timer(void *arg) fdisk, _bdev_aio_get_io_inflight_done); - return -1; + return SPDK_POLLER_BUSY; } static void @@ -481,7 +496,13 @@ bdev_aio_create_cb(void *io_device, void *ctx_buf) { struct bdev_aio_io_channel *ch = ctx_buf; + if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) { + SPDK_ERRLOG("async I/O context setup failure\n"); + return -1; + } + ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if)); + TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link); return 0; } @@ -491,6 +512,11 @@ bdev_aio_destroy_cb(void *io_device, void *ctx_buf) { struct bdev_aio_io_channel *ch = ctx_buf; + io_destroy(ch->io_ctx); + + assert(ch->group_ch); + TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link); + spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); } @@ -561,10 +587,7 @@ bdev_aio_group_create_cb(void *io_device, void *ctx_buf) { struct bdev_aio_group_channel *ch = ctx_buf; - if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) { - SPDK_ERRLOG("async I/O context setup failure\n"); - return -1; - } + TAILQ_INIT(&ch->io_ch_head); ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0); return 0; @@ -575,7 +598,9 @@ bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf) { struct bdev_aio_group_channel *ch = ctx_buf; - io_destroy(ch->io_ctx); + if (!TAILQ_EMPTY(&ch->io_ch_head)) { + SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n"); + } spdk_poller_unregister(&ch->poller); } @@ -656,7 +681,11 @@ create_aio_bdev(const char *name, const char *filename, uint32_t block_size) } fdisk->disk.blocklen = block_size; - fdisk->disk.required_alignment = spdk_u32log2(block_size); + if (fdisk->block_size_override && detected_block_size) { + fdisk->disk.required_alignment = spdk_u32log2(detected_block_size); + } else { + fdisk->disk.required_alignment = spdk_u32log2(block_size); + } if 
(disk_size % fdisk->disk.blocklen != 0) { SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", diff --git a/module/bdev/compress/vbdev_compress.c b/module/bdev/compress/vbdev_compress.c index 336f0484cac..a83c97c6407 100644 --- a/module/bdev/compress/vbdev_compress.c +++ b/module/bdev/compress/vbdev_compress.c @@ -188,7 +188,7 @@ static struct rte_comp_xform g_decomp_xform = { static void vbdev_compress_examine(struct spdk_bdev *bdev); static void vbdev_compress_claim(struct vbdev_compress *comp_bdev); static void vbdev_compress_queue_io(struct spdk_bdev_io *bdev_io); -struct vbdev_compress *_prepare_for_load_init(struct spdk_bdev *bdev); +struct vbdev_compress *_prepare_for_load_init(struct spdk_bdev *bdev, uint32_t lb_size); static void vbdev_compress_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io); static void comp_bdev_ch_destroy_cb(void *io_device, void *ctx_buf); static void vbdev_compress_delete_done(void *cb_arg, int bdeverrno); @@ -702,7 +702,7 @@ comp_dev_poller(void *args) } } } - return 0; + return num_deq == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; } /* Entry point for reduce lib to issue a compress operation. */ @@ -1284,7 +1284,7 @@ vbdev_compress_base_bdev_hotremove_cb(void *ctx) * information for reducelib to init or load. 
*/ struct vbdev_compress * -_prepare_for_load_init(struct spdk_bdev *bdev) +_prepare_for_load_init(struct spdk_bdev *bdev, uint32_t lb_size) { struct vbdev_compress *meta_ctx; @@ -1306,7 +1306,12 @@ _prepare_for_load_init(struct spdk_bdev *bdev) meta_ctx->backing_dev.blockcnt = bdev->blockcnt; meta_ctx->params.chunk_size = CHUNK_SIZE; - meta_ctx->params.logical_block_size = bdev->blocklen; + if (lb_size == 0) { + meta_ctx->params.logical_block_size = bdev->blocklen; + } else { + meta_ctx->params.logical_block_size = lb_size; + } + meta_ctx->params.backing_io_unit_size = BACKING_IO_SZ; return meta_ctx; } @@ -1334,12 +1339,12 @@ _set_pmd(struct vbdev_compress *comp_dev) /* Call reducelib to initialize a new volume */ static int -vbdev_init_reduce(struct spdk_bdev *bdev, const char *pm_path) +vbdev_init_reduce(struct spdk_bdev *bdev, const char *pm_path, uint32_t lb_size) { struct vbdev_compress *meta_ctx; int rc; - meta_ctx = _prepare_for_load_init(bdev); + meta_ctx = _prepare_for_load_init(bdev, lb_size); if (meta_ctx == NULL) { return -EINVAL; } @@ -1471,7 +1476,7 @@ comp_bdev_ch_destroy_cb(void *io_device, void *ctx_buf) /* RPC entry point for compression vbdev creation. */ int -create_compress_bdev(const char *bdev_name, const char *pm_path) +create_compress_bdev(const char *bdev_name, const char *pm_path, uint32_t lb_size) { struct spdk_bdev *bdev; @@ -1480,7 +1485,12 @@ create_compress_bdev(const char *bdev_name, const char *pm_path) return -ENODEV; } - return vbdev_init_reduce(bdev, pm_path);; + if ((lb_size != 0) && (lb_size != LB_SIZE_4K) && (lb_size != LB_SIZE_512B)) { + SPDK_ERRLOG("Logical block size must be 512 or 4096\n"); + return -EINVAL; + } + + return vbdev_init_reduce(bdev, pm_path, lb_size); } /* On init, just init the compress drivers. All metadata is stored on disk. 
*/ @@ -1595,7 +1605,7 @@ vbdev_compress_claim(struct vbdev_compress *comp_bdev) comp_bdev->comp_bdev.split_on_optimal_io_boundary = true; - comp_bdev->comp_bdev.blocklen = comp_bdev->base_bdev->blocklen; + comp_bdev->comp_bdev.blocklen = comp_bdev->params.logical_block_size; comp_bdev->comp_bdev.blockcnt = comp_bdev->params.vol_size / comp_bdev->comp_bdev.blocklen; assert(comp_bdev->comp_bdev.blockcnt > 0); @@ -1822,7 +1832,7 @@ vbdev_compress_examine(struct spdk_bdev *bdev) return; } - meta_ctx = _prepare_for_load_init(bdev); + meta_ctx = _prepare_for_load_init(bdev, 0); if (meta_ctx == NULL) { spdk_bdev_module_examine_done(&compress_if); return; diff --git a/module/bdev/compress/vbdev_compress.h b/module/bdev/compress/vbdev_compress.h index 1b5061a2d8f..4dcd78f6078 100644 --- a/module/bdev/compress/vbdev_compress.h +++ b/module/bdev/compress/vbdev_compress.h @@ -38,6 +38,9 @@ #include "spdk/bdev.h" +#define LB_SIZE_4K 0x1000UL +#define LB_SIZE_512B 0x200UL + /** * Get the first compression bdev. * @@ -85,9 +88,10 @@ typedef void (*spdk_delete_compress_complete)(void *cb_arg, int bdeverrno); * * \param bdev_name Bdev on which compression bdev will be created. * \param pm_path Path to persistent memory. + * \param lb_size Logical block size for the compressed volume in bytes. Must be 4K or 512. * \return 0 on success, other on failure. */ -int create_compress_bdev(const char *bdev_name, const char *pm_path); +int create_compress_bdev(const char *bdev_name, const char *pm_path, uint32_t lb_size); /** * Delete compress bdev. 
diff --git a/module/bdev/compress/vbdev_compress_rpc.c b/module/bdev/compress/vbdev_compress_rpc.c index 9bdb25650d3..bd18d45721d 100644 --- a/module/bdev/compress/vbdev_compress_rpc.c +++ b/module/bdev/compress/vbdev_compress_rpc.c @@ -107,8 +107,8 @@ static const struct spdk_json_object_decoder rpc_compress_pmd_decoder[] = { }; static void -rpc_compress_set_pmd(struct spdk_jsonrpc_request *request, - const struct spdk_json_val *params) +rpc_bdev_compress_set_pmd(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) { struct rpc_compress_set_pmd req; struct spdk_json_write_ctx *w; @@ -141,14 +141,16 @@ rpc_compress_set_pmd(struct spdk_jsonrpc_request *request, spdk_jsonrpc_end_result(request, w); } } -SPDK_RPC_REGISTER("compress_set_pmd", rpc_compress_set_pmd, +SPDK_RPC_REGISTER("bdev_compress_set_pmd", rpc_bdev_compress_set_pmd, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) -SPDK_RPC_REGISTER_ALIAS_DEPRECATED(compress_set_pmd, set_compress_pmd) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_compress_set_pmd, set_compress_pmd) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_compress_set_pmd, compress_set_pmd) /* Structure to hold the parameters for this RPC method. */ struct rpc_construct_compress { char *base_bdev_name; char *pm_path; + uint32_t lb_size; }; /* Free the allocated memory resource after the RPC handling. 
*/ @@ -163,6 +165,7 @@ free_rpc_construct_compress(struct rpc_construct_compress *r) static const struct spdk_json_object_decoder rpc_construct_compress_decoders[] = { {"base_bdev_name", offsetof(struct rpc_construct_compress, base_bdev_name), spdk_json_decode_string}, {"pm_path", offsetof(struct rpc_construct_compress, pm_path), spdk_json_decode_string}, + {"lb_size", offsetof(struct rpc_construct_compress, lb_size), spdk_json_decode_uint32}, }; /* Decode the parameters for this RPC method and properly construct the compress @@ -181,12 +184,12 @@ rpc_bdev_compress_create(struct spdk_jsonrpc_request *request, SPDK_COUNTOF(rpc_construct_compress_decoders), &req)) { SPDK_DEBUGLOG(SPDK_LOG_VBDEV_COMPRESS, "spdk_json_decode_object failed\n"); - spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, "spdk_json_decode_object failed"); goto cleanup; } - rc = create_compress_bdev(req.base_bdev_name, req.pm_path); + rc = create_compress_bdev(req.base_bdev_name, req.pm_path, req.lb_size); if (rc != 0) { spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); goto cleanup; diff --git a/module/bdev/delay/vbdev_delay.c b/module/bdev/delay/vbdev_delay.c index 2b3cd0face1..b4ea1b413a5 100644 --- a/module/bdev/delay/vbdev_delay.c +++ b/module/bdev/delay/vbdev_delay.c @@ -165,16 +165,18 @@ vbdev_delay_destruct(void *ctx) return 0; } -static void +static int _process_io_stailq(void *arg, uint64_t ticks) { STAILQ_HEAD(, delay_bdev_io) *head = arg; struct delay_bdev_io *io_ctx, *tmp; + int completions = 0; STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) { if (io_ctx->completion_tick <= ticks) { STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link); spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), io_ctx->status); + completions++; } else { /* In the general case, I/O will become ready in an fifo order. When timeouts are dynamically * changed, this is not necessarily the case. 
However, the normal behavior will be restored @@ -186,6 +188,8 @@ _process_io_stailq(void *arg, uint64_t ticks) break; } } + + return completions; } static int @@ -193,13 +197,14 @@ _delay_finish_io(void *arg) { struct delay_io_channel *delay_ch = arg; uint64_t ticks = spdk_get_ticks(); + int completions = 0; - _process_io_stailq(&delay_ch->avg_read_io, ticks); - _process_io_stailq(&delay_ch->avg_write_io, ticks); - _process_io_stailq(&delay_ch->p99_read_io, ticks); - _process_io_stailq(&delay_ch->p99_write_io, ticks); + completions += _process_io_stailq(&delay_ch->avg_read_io, ticks); + completions += _process_io_stailq(&delay_ch->avg_write_io, ticks); + completions += _process_io_stailq(&delay_ch->p99_read_io, ticks); + completions += _process_io_stailq(&delay_ch->p99_write_io, ticks); - return 0; + return completions == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; } /* Completion callback for IO that were issued from this bdev. The original bdev_io diff --git a/module/bdev/gpt/vbdev_gpt.c b/module/bdev/gpt/vbdev_gpt.c index 5232444fb72..3d6083058dd 100644 --- a/module/bdev/gpt/vbdev_gpt.c +++ b/module/bdev/gpt/vbdev_gpt.c @@ -549,8 +549,9 @@ vbdev_gpt_examine(struct spdk_bdev *bdev) } if (spdk_bdev_get_block_size(bdev) % 512 != 0) { - SPDK_ERRLOG("GPT module does not support block size %" PRIu32 " for bdev %s\n", - spdk_bdev_get_block_size(bdev), spdk_bdev_get_name(bdev)); + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, + "GPT module does not support block size %" PRIu32 " for bdev %s\n", + spdk_bdev_get_block_size(bdev), spdk_bdev_get_name(bdev)); spdk_bdev_module_examine_done(&gpt_if); return; } diff --git a/module/bdev/iscsi/bdev_iscsi.c b/module/bdev/iscsi/bdev_iscsi.c index 99792733f80..18e8e009028 100644 --- a/module/bdev/iscsi/bdev_iscsi.c +++ b/module/bdev/iscsi/bdev_iscsi.c @@ -378,27 +378,29 @@ bdev_iscsi_poll_lun(void *_lun) if (poll(&pfd, 1, 0) < 0) { SPDK_ERRLOG("poll failed\n"); - return -1; + return SPDK_POLLER_IDLE; } if (pfd.revents != 0) { if 
(iscsi_service(lun->context, pfd.revents) < 0) { SPDK_ERRLOG("iscsi_service failed: %s\n", iscsi_get_error(lun->context)); } + + return SPDK_POLLER_BUSY; } - return -1; + return SPDK_POLLER_IDLE; } static int bdev_iscsi_no_master_ch_poll(void *arg) { struct bdev_iscsi_lun *lun = arg; - int rc = 0; + enum spdk_thread_poller_rc rc = SPDK_POLLER_IDLE; if (pthread_mutex_trylock(&lun->mutex)) { /* Don't care about the error code here. */ - return -1; + return SPDK_POLLER_IDLE; } if (lun->ch_count == 0) { @@ -754,6 +756,10 @@ iscsi_bdev_conn_poll(void *arg) struct pollfd pfd; struct iscsi_context *context; + if (TAILQ_EMPTY(&g_iscsi_conn_req)) { + return SPDK_POLLER_IDLE; + } + TAILQ_FOREACH_SAFE(req, &g_iscsi_conn_req, link, tmp) { context = req->context; pfd.fd = iscsi_get_fd(context); @@ -761,7 +767,7 @@ iscsi_bdev_conn_poll(void *arg) pfd.revents = 0; if (poll(&pfd, 1, 0) < 0) { SPDK_ERRLOG("poll failed\n"); - return -1; + return SPDK_POLLER_BUSY; } if (pfd.revents != 0) { @@ -784,7 +790,7 @@ iscsi_bdev_conn_poll(void *arg) _bdev_iscsi_conn_req_free(req); } } - return -1; + return SPDK_POLLER_BUSY; } int diff --git a/module/bdev/lvol/vbdev_lvol_rpc.c b/module/bdev/lvol/vbdev_lvol_rpc.c index b1bdccab732..79e74f6a50f 100644 --- a/module/bdev/lvol/vbdev_lvol_rpc.c +++ b/module/bdev/lvol/vbdev_lvol_rpc.c @@ -1016,13 +1016,11 @@ static void rpc_dump_lvol_store_info(struct spdk_json_write_ctx *w, struct lvol_store_bdev *lvs_bdev) { struct spdk_blob_store *bs; - uint64_t cluster_size, block_size; + uint64_t cluster_size; char uuid[SPDK_UUID_STRING_LEN]; bs = lvs_bdev->lvs->blobstore; cluster_size = spdk_bs_get_cluster_size(bs); - /* Block size of lvols is always size of blob store page */ - block_size = spdk_bs_get_page_size(bs); spdk_json_write_object_begin(w); @@ -1037,7 +1035,7 @@ rpc_dump_lvol_store_info(struct spdk_json_write_ctx *w, struct lvol_store_bdev * spdk_json_write_named_uint64(w, "free_clusters", spdk_bs_free_cluster_count(bs)); - 
spdk_json_write_named_uint64(w, "block_size", block_size); + spdk_json_write_named_uint64(w, "block_size", spdk_bs_get_io_unit_size(bs)); spdk_json_write_named_uint64(w, "cluster_size", cluster_size); diff --git a/module/bdev/malloc/bdev_malloc.c b/module/bdev/malloc/bdev_malloc.c index 53156dc3a2a..ce040315331 100644 --- a/module/bdev/malloc/bdev_malloc.c +++ b/module/bdev/malloc/bdev_malloc.c @@ -58,22 +58,10 @@ struct malloc_task { enum spdk_bdev_io_status status; }; -static struct malloc_task * -__malloc_task_from_accel_task(struct spdk_accel_task *ct) -{ - return (struct malloc_task *)((uintptr_t)ct - sizeof(struct malloc_task)); -} - -static struct spdk_accel_task * -__accel_task_from_malloc_task(struct malloc_task *mt) -{ - return (struct spdk_accel_task *)((uintptr_t)mt + sizeof(struct malloc_task)); -} - static void malloc_done(void *ref, int status) { - struct malloc_task *task = __malloc_task_from_accel_task(ref); + struct malloc_task *task = (struct malloc_task *)ref; if (status != 0) { if (status == -ENOMEM) { @@ -98,7 +86,7 @@ static void bdev_malloc_get_spdk_running_config(FILE *fp); static int bdev_malloc_get_ctx_size(void) { - return sizeof(struct malloc_task) + spdk_accel_task_size(); + return sizeof(struct malloc_task); } static struct spdk_bdev_module malloc_if = { @@ -171,12 +159,11 @@ bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, task->num_outstanding = iovcnt; for (i = 0; i < iovcnt; i++) { - res = spdk_accel_submit_copy(__accel_task_from_malloc_task(task), - ch, iov[i].iov_base, - src, iov[i].iov_len, malloc_done); + res = spdk_accel_submit_copy(ch, iov[i].iov_base, + src, iov[i].iov_len, malloc_done, task); if (res != 0) { - malloc_done(__accel_task_from_malloc_task(task), res); + malloc_done(task, res); } src += iov[i].iov_len; @@ -206,12 +193,11 @@ bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch, task->num_outstanding = iovcnt; for (i = 0; i < iovcnt; i++) { - res = 
spdk_accel_submit_copy(__accel_task_from_malloc_task(task), - ch, dst, iov[i].iov_base, - iov[i].iov_len, malloc_done); + res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base, + iov[i].iov_len, malloc_done, task); if (res != 0) { - malloc_done(__accel_task_from_malloc_task(task), res); + malloc_done(task, res); } dst += iov[i].iov_len; @@ -228,8 +214,8 @@ bdev_malloc_unmap(struct malloc_disk *mdisk, task->status = SPDK_BDEV_IO_STATUS_SUCCESS; task->num_outstanding = 1; - return spdk_accel_submit_fill(__accel_task_from_malloc_task(task), ch, - mdisk->malloc_buf + offset, 0, byte_count, malloc_done); + return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0, + byte_count, malloc_done, task); } static int64_t @@ -481,6 +467,8 @@ static int bdev_malloc_initialize(void) uint64_t size; struct spdk_bdev *bdev; + malloc_disk_count = 0; + if (sp != NULL) { NumberOfLuns = spdk_conf_section_get_intval(sp, "NumberOfLuns"); LunSizeInMB = spdk_conf_section_get_intval(sp, "LunSizeInMB"); diff --git a/module/bdev/null/bdev_null.c b/module/bdev/null/bdev_null.c index ee9480c50b3..97aa8b03f3f 100644 --- a/module/bdev/null/bdev_null.c +++ b/module/bdev/null/bdev_null.c @@ -368,7 +368,7 @@ null_io_poll(void *arg) TAILQ_SWAP(&ch->io, &io, spdk_bdev_io, module_link); if (TAILQ_EMPTY(&io)) { - return 0; + return SPDK_POLLER_IDLE; } while (!TAILQ_EMPTY(&io)) { @@ -377,7 +377,7 @@ null_io_poll(void *arg) spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); } - return 1; + return SPDK_POLLER_BUSY; } static int diff --git a/module/bdev/null/bdev_null_rpc.c b/module/bdev/null/bdev_null_rpc.c index f3a433e755c..08bdb28975f 100644 --- a/module/bdev/null/bdev_null_rpc.c +++ b/module/bdev/null/bdev_null_rpc.c @@ -120,6 +120,12 @@ rpc_bdev_null_create(struct spdk_jsonrpc_request *request, goto cleanup; } + if (req.dif_type != SPDK_DIF_DISABLE && !req.md_size) { + spdk_jsonrpc_send_error_response(request, -EINVAL, + "Interleaved metadata size should be set for DIF"); + goto 
cleanup; + } + opts.name = req.name; opts.uuid = uuid; opts.num_blocks = req.num_blocks; diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c index d5c346dbf41..8f8da2219d5 100644 --- a/module/bdev/nvme/bdev_nvme.c +++ b/module/bdev/nvme/bdev_nvme.c @@ -162,7 +162,10 @@ static int bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel static int bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len); -static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio); +static int bdev_nvme_abort(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); +static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio, + bool failover); typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); @@ -257,25 +260,23 @@ bdev_nvme_poll(void *arg) } } - return num_completions; + return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; } static int bdev_nvme_poll_adminq(void *arg) { int32_t rc; - struct spdk_nvme_ctrlr *ctrlr = arg; - struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; - rc = spdk_nvme_ctrlr_process_admin_completions(ctrlr); + assert(nvme_bdev_ctrlr != NULL); + rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr); if (rc < 0) { - nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr)); - assert(nvme_bdev_ctrlr != NULL); - bdev_nvme_reset(nvme_bdev_ctrlr, NULL); + bdev_nvme_reset(nvme_bdev_ctrlr, NULL, true); } - return rc; + return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; } static int @@ -338,6 +339,7 @@ _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc) pthread_mutex_lock(&g_bdev_nvme_mutex); nvme_bdev_ctrlr->resetting = false; + nvme_bdev_ctrlr->failover_in_progress = false; pthread_mutex_unlock(&g_bdev_nvme_mutex); /* Make sure we clear any pending resets before returning. */ spdk_for_each_channel(nvme_bdev_ctrlr, @@ -447,10 +449,12 @@ _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) } static int -bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio) +bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio, bool failover) { struct spdk_io_channel *ch; struct nvme_io_channel *nvme_ch; + struct nvme_bdev_ctrlr_trid *next_trid = NULL, *tmp_trid = NULL; + int rc = 0; pthread_mutex_lock(&g_bdev_nvme_mutex); if (nvme_bdev_ctrlr->destruct) { @@ -462,9 +466,25 @@ bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bi return 0; } + if (failover) { + tmp_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); + assert(tmp_trid); + assert(&tmp_trid->trid == nvme_bdev_ctrlr->connected_trid); + next_trid = TAILQ_NEXT(tmp_trid, link); + if (!next_trid) { + failover = false; + } + } + if (!nvme_bdev_ctrlr->resetting) { nvme_bdev_ctrlr->resetting = true; + if (failover) { + nvme_bdev_ctrlr->failover_in_progress = true; + } } else { + if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) { + rc = -EAGAIN; + } pthread_mutex_unlock(&g_bdev_nvme_mutex); SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); /* @@ -479,7 +499,19 @@ bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bi TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, spdk_bdev_io_from_ctx(bio), module_link); spdk_put_io_channel(ch); } - return 0; + return rc; + } + + if (failover) { + spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr); + nvme_bdev_ctrlr->connected_trid = &next_trid->trid; + rc = 
spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid); + assert(rc == 0); + /** Shuffle the old trid to the end of the list and use the new one. + * Allows for round robin through multiple connections. + */ + TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, tmp_trid, link); + TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, tmp_trid, link); } pthread_mutex_unlock(&g_bdev_nvme_mutex); @@ -533,6 +565,7 @@ _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; + struct nvme_bdev_io *nbdev_io_to_abort; if (nvme_ch->qpair == NULL) { /* The device is currently resetting */ @@ -592,7 +625,7 @@ _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_ bdev_io->u.bdev.num_blocks); case SPDK_BDEV_IO_TYPE_RESET: - return bdev_nvme_reset(nbdev->nvme_bdev_ctrlr, nbdev_io); + return bdev_nvme_reset(nbdev->nvme_bdev_ctrlr, nbdev_io, false); case SPDK_BDEV_IO_TYPE_FLUSH: return bdev_nvme_flush(nbdev, @@ -626,6 +659,13 @@ _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_ bdev_io->u.nvme_passthru.md_buf, bdev_io->u.nvme_passthru.md_len); + case SPDK_BDEV_IO_TYPE_ABORT: + nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; + return bdev_nvme_abort(nbdev, + ch, + nbdev_io, + nbdev_io_to_abort); + default: return -EINVAL; } @@ -659,6 +699,7 @@ bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) case SPDK_BDEV_IO_TYPE_FLUSH: case SPDK_BDEV_IO_TYPE_NVME_ADMIN: case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_ABORT: return true; case SPDK_BDEV_IO_TYPE_COMPARE: @@ -793,7 +834,7 @@ bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf) return -1; } - group->poller = spdk_poller_register(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); + 
group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); if (group->poller == NULL) { spdk_nvme_poll_group_destroy(group->group); @@ -841,13 +882,13 @@ bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) spdk_json_write_named_object_begin(w, "nvme"); - if (nvme_bdev_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { - spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->trid.traddr); + if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { + spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->connected_trid->traddr); } spdk_json_write_named_object_begin(w, "trid"); - nvme_bdev_dump_trid_json(&nvme_bdev_ctrlr->trid, w); + nvme_bdev_dump_trid_json(nvme_bdev_ctrlr->connected_trid, w); spdk_json_write_object_end(w); @@ -1129,7 +1170,7 @@ nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) SPDK_WARNLOG("Abort failed. Resetting controller.\n"); nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr)); assert(nvme_bdev_ctrlr != NULL); - bdev_nvme_reset(nvme_bdev_ctrlr, NULL); + bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false); } } @@ -1148,7 +1189,7 @@ timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, SPDK_ERRLOG("Controller Fatal Status, reset required\n"); nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr)); assert(nvme_bdev_ctrlr != NULL); - bdev_nvme_reset(nvme_bdev_ctrlr, NULL); + bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false); return; } @@ -1168,7 +1209,7 @@ timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr)); assert(nvme_bdev_ctrlr != NULL); - bdev_nvme_reset(nvme_bdev_ctrlr, NULL); + bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false); break; case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "No action for nvme controller timeout.\n"); @@ -1350,6 +1391,7 @@ 
create_ctrlr(struct spdk_nvme_ctrlr *ctrlr, uint32_t prchk_flags) { struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct nvme_bdev_ctrlr_trid *trid_entry; uint32_t i; int rc; @@ -1358,6 +1400,8 @@ create_ctrlr(struct spdk_nvme_ctrlr *ctrlr, SPDK_ERRLOG("Failed to allocate device struct\n"); return -ENOMEM; } + + TAILQ_INIT(&nvme_bdev_ctrlr->trids); nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); if (!nvme_bdev_ctrlr->namespaces) { @@ -1366,6 +1410,16 @@ create_ctrlr(struct spdk_nvme_ctrlr *ctrlr, return -ENOMEM; } + trid_entry = calloc(1, sizeof(*trid_entry)); + if (trid_entry == NULL) { + SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); + free(nvme_bdev_ctrlr->namespaces); + free(nvme_bdev_ctrlr); + return -ENOMEM; + } + + trid_entry->trid = *trid; + for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns)); if (nvme_bdev_ctrlr->namespaces[i] == NULL) { @@ -1373,18 +1427,21 @@ create_ctrlr(struct spdk_nvme_ctrlr *ctrlr, for (; i > 0; i--) { free(nvme_bdev_ctrlr->namespaces[i - 1]); } + free(trid_entry); free(nvme_bdev_ctrlr->namespaces); free(nvme_bdev_ctrlr); return -ENOMEM; } } + nvme_bdev_ctrlr->thread = spdk_get_thread(); nvme_bdev_ctrlr->adminq_timer_poller = NULL; nvme_bdev_ctrlr->ctrlr = ctrlr; nvme_bdev_ctrlr->ref = 0; - nvme_bdev_ctrlr->trid = *trid; + nvme_bdev_ctrlr->connected_trid = &trid_entry->trid; nvme_bdev_ctrlr->name = strdup(name); if (nvme_bdev_ctrlr->name == NULL) { + free(trid_entry); free(nvme_bdev_ctrlr->namespaces); free(nvme_bdev_ctrlr); return -ENOMEM; @@ -1394,6 +1451,7 @@ create_ctrlr(struct spdk_nvme_ctrlr *ctrlr, rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr); if (spdk_unlikely(rc != 0)) { SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); + free(trid_entry); free(nvme_bdev_ctrlr->name); free(nvme_bdev_ctrlr->namespaces); free(nvme_bdev_ctrlr); @@ -1407,7 
+1465,7 @@ create_ctrlr(struct spdk_nvme_ctrlr *ctrlr, sizeof(struct nvme_io_channel), name); - nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, ctrlr, + nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr, g_opts.nvme_adminq_poll_period_us); TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); @@ -1426,6 +1484,8 @@ create_ctrlr(struct spdk_nvme_ctrlr *ctrlr, SPDK_ERRLOG("Failed to initialize Opal\n"); } } + + TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link); return 0; } @@ -1525,17 +1585,16 @@ bdev_nvme_hotplug(void *arg) hotplug_probe_cb, attach_cb, remove_cb); if (!g_hotplug_probe_ctx) { - return -1; + return SPDK_POLLER_BUSY; } } done = spdk_nvme_probe_poll_async(g_hotplug_probe_ctx); if (done != -EAGAIN) { g_hotplug_probe_ctx = NULL; - return 1; } - return -1; + return SPDK_POLLER_BUSY; } void @@ -1627,7 +1686,7 @@ nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx) uint32_t i, nsid; size_t j; - nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid); + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); assert(nvme_bdev_ctrlr != NULL); /* @@ -1696,7 +1755,132 @@ bdev_nvme_async_poll(void *arg) free(ctx); } - return 1; + return SPDK_POLLER_BUSY; +} + +int +bdev_nvme_add_trid(const char *name, struct spdk_nvme_transport_id *trid) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct spdk_nvme_ctrlr *new_ctrlr; + struct spdk_nvme_ctrlr_opts opts; + uint32_t i; + struct spdk_nvme_ns *ns, *new_ns; + const struct spdk_nvme_ns_data *ns_data, *new_ns_data; + struct nvme_bdev_ctrlr_trid *new_trid; + int rc = 0; + + assert(name != NULL); + + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); + if (nvme_bdev_ctrlr == NULL) { + SPDK_ERRLOG("Failed to find NVMe controller\n"); + return -ENODEV; + } + + /* Currently we only support failover to the same transport type. 
*/ + if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) { + return -EINVAL; + } + + /* Currently we only support failover to the same NQN. */ + if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) { + return -EINVAL; + } + + /* Skip all the other checks if we've already registered this path. */ + TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) { + if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) { + return -EEXIST; + } + } + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts)); + opts.transport_retry_count = g_opts.retry_count; + + new_ctrlr = spdk_nvme_connect(trid, &opts, sizeof(opts)); + + if (new_ctrlr == NULL) { + return -ENODEV; + } + + if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) { + rc = -EINVAL; + goto out; + } + + for (i = 1; i <= nvme_bdev_ctrlr->num_ns; i++) { + ns = spdk_nvme_ctrlr_get_ns(nvme_bdev_ctrlr->ctrlr, i); + new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, i); + assert(ns != NULL); + assert(new_ns != NULL); + + ns_data = spdk_nvme_ns_get_data(ns); + new_ns_data = spdk_nvme_ns_get_data(new_ns); + if (memcmp(ns_data->nguid, new_ns_data->nguid, sizeof(ns_data->nguid))) { + rc = -EINVAL; + goto out; + } + } + + new_trid = calloc(1, sizeof(*new_trid)); + if (new_trid == NULL) { + rc = -ENOMEM; + goto out; + } + new_trid->trid = *trid; + TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link); + +out: + spdk_nvme_detach(new_ctrlr); + return rc; +} + +int +bdev_nvme_remove_trid(const char *name, struct spdk_nvme_transport_id *trid) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct nvme_bdev_ctrlr_trid *ctrlr_trid, *tmp_trid; + + if (name == NULL) { + return -EINVAL; + } + + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); + if (nvme_bdev_ctrlr == NULL) { + SPDK_ERRLOG("Failed to find NVMe controller\n"); + return -ENODEV; + } + + /* case 1: we are currently using the path to be removed. 
*/ + if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { + ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); + assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid); + /* case 1A: the current path is the only path. */ + if (!TAILQ_NEXT(ctrlr_trid, link)) { + return bdev_nvme_delete(name); + } + + /* case 1B: there is an alternative path. */ + if (bdev_nvme_reset(nvme_bdev_ctrlr, NULL, true) == -EAGAIN) { + return -EAGAIN; + } + assert(nvme_bdev_ctrlr->connected_trid != &ctrlr_trid->trid); + TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link); + free(ctrlr_trid); + return 0; + } + /* case 2: We are not using the specified path. */ + TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) { + if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { + TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link); + free(ctrlr_trid); + return 0; + } + } + + /* case 2A: The address isn't even in the registered list. */ + return -ENXIO; } int @@ -1712,17 +1896,26 @@ bdev_nvme_create(struct spdk_nvme_transport_id *trid, { struct nvme_probe_skip_entry *entry, *tmp; struct nvme_async_probe_ctx *ctx; + struct nvme_bdev_ctrlr *existing_ctrlr; + int rc; - if (nvme_bdev_ctrlr_get(trid) != NULL) { + existing_ctrlr = nvme_bdev_ctrlr_get_by_name(base_name); + if (existing_ctrlr) { + if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { + SPDK_ERRLOG("A controller with the provided name (name: %s) already exists with transport type PCIe. PCIe multipath is not supported.\n", + base_name); + return -EEXIST; + } + rc = bdev_nvme_add_trid(existing_ctrlr->name, trid); + if (rc) { + return rc; + } + /* TODO expand this check to include both the host and target TRIDs. Only if both are the same should we fail. 
*/ + } else if (nvme_bdev_ctrlr_get(trid) != NULL) { SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); return -EEXIST; } - if (nvme_bdev_ctrlr_get_by_name(base_name)) { - SPDK_ERRLOG("A controller with the provided name (%s) already exists.\n", base_name); - return -EEXIST; - } - if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { @@ -1745,6 +1938,11 @@ bdev_nvme_create(struct spdk_nvme_transport_id *trid, ctx->prchk_flags = prchk_flags; ctx->trid = *trid; + if (existing_ctrlr) { + nvme_ctrlr_populate_namespaces_done(ctx); + return 0; + } + spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); ctx->opts.transport_retry_count = g_opts.retry_count; @@ -1787,12 +1985,12 @@ bdev_nvme_delete(const char *name) return -ENODEV; } - if (nvme_bdev_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { entry = calloc(1, sizeof(*entry)); if (!entry) { return -ENOMEM; } - entry->trid = nvme_bdev_ctrlr->trid; + entry->trid = *nvme_bdev_ctrlr->connected_trid; TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); } @@ -2233,6 +2431,28 @@ bdev_nvme_admin_passthru_completion(void *ctx) bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); } +static void +bdev_nvme_abort_completion(void *ctx) +{ + struct nvme_bdev_io *bio = ctx; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + + if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_bdev_io *bio = ref; + + bio->cpl = *cpl; + spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); +} + static void 
bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) { @@ -2617,6 +2837,60 @@ bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); } +static void +bdev_nvme_abort_admin_cmd(void *ctx) +{ + struct nvme_bdev_io *bio = ctx; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + struct nvme_bdev *nbdev; + struct nvme_bdev_io *bio_to_abort; + int rc; + + nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; + bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; + + rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_bdev_ctrlr->ctrlr, + NULL, + bio_to_abort, + bdev_nvme_abort_done, bio); + if (rc == -ENOENT) { + /* If no admin command was found in admin qpair, complete the abort + * request with failure. + */ + bio->cpl.cdw0 |= 1U; + bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; + bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); + } +} + +static int +bdev_nvme_abort(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + int rc; + + bio->orig_thread = spdk_io_channel_get_thread(ch); + + rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_bdev_ctrlr->ctrlr, + nvme_ch->qpair, + bio_to_abort, + bdev_nvme_abort_done, bio); + if (rc == -ENOENT) { + /* If no command was found in I/O qpair, the target command may be + * admin command. Only a single thread tries aborting admin command + * to clean I/O flow. 
+ */ + spdk_thread_send_msg(nbdev->nvme_bdev_ctrlr->thread, + bdev_nvme_abort_admin_cmd, bio); + rc = 0; + } + + return rc; +} + static void bdev_nvme_get_spdk_running_config(FILE *fp) { @@ -2636,31 +2910,31 @@ bdev_nvme_get_spdk_running_config(FILE *fp) const char *trtype; const char *prchk_flags; - trtype = spdk_nvme_transport_id_trtype_str(nvme_bdev_ctrlr->trid.trtype); + trtype = spdk_nvme_transport_id_trtype_str(nvme_bdev_ctrlr->connected_trid->trtype); if (!trtype) { continue; } - if (nvme_bdev_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { fprintf(fp, "TransportID \"trtype:%s traddr:%s\" %s\n", trtype, - nvme_bdev_ctrlr->trid.traddr, nvme_bdev_ctrlr->name); + nvme_bdev_ctrlr->connected_trid->traddr, nvme_bdev_ctrlr->name); } else { const char *adrfam; - adrfam = spdk_nvme_transport_id_adrfam_str(nvme_bdev_ctrlr->trid.adrfam); + adrfam = spdk_nvme_transport_id_adrfam_str(nvme_bdev_ctrlr->connected_trid->adrfam); prchk_flags = spdk_nvme_prchk_flags_str(nvme_bdev_ctrlr->prchk_flags); if (adrfam) { fprintf(fp, "TransportID \"trtype:%s adrfam:%s traddr:%s trsvcid:%s subnqn:%s\" %s", trtype, adrfam, - nvme_bdev_ctrlr->trid.traddr, nvme_bdev_ctrlr->trid.trsvcid, - nvme_bdev_ctrlr->trid.subnqn, nvme_bdev_ctrlr->name); + nvme_bdev_ctrlr->connected_trid->traddr, nvme_bdev_ctrlr->connected_trid->trsvcid, + nvme_bdev_ctrlr->connected_trid->subnqn, nvme_bdev_ctrlr->name); } else { fprintf(fp, "TransportID \"trtype:%s traddr:%s trsvcid:%s subnqn:%s\" %s", trtype, - nvme_bdev_ctrlr->trid.traddr, nvme_bdev_ctrlr->trid.trsvcid, - nvme_bdev_ctrlr->trid.subnqn, nvme_bdev_ctrlr->name); + nvme_bdev_ctrlr->connected_trid->traddr, nvme_bdev_ctrlr->connected_trid->trsvcid, + nvme_bdev_ctrlr->connected_trid->subnqn, nvme_bdev_ctrlr->name); } if (prchk_flags) { @@ -2768,7 +3042,7 @@ bdev_nvme_config_json(struct spdk_json_write_ctx *w) pthread_mutex_lock(&g_bdev_nvme_mutex); TAILQ_FOREACH(nvme_bdev_ctrlr, 
&g_nvme_bdev_ctrlrs, tailq) { - trid = &nvme_bdev_ctrlr->trid; + trid = nvme_bdev_ctrlr->connected_trid; spdk_json_write_object_begin(w); diff --git a/module/bdev/nvme/bdev_nvme.h b/module/bdev/nvme/bdev_nvme.h index 417c21cade2..388fa91df7a 100644 --- a/module/bdev/nvme/bdev_nvme.h +++ b/module/bdev/nvme/bdev_nvme.h @@ -66,6 +66,8 @@ struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts); int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts); int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx); +int bdev_nvme_add_trid(const char *name, struct spdk_nvme_transport_id *trid); +int bdev_nvme_remove_trid(const char *name, struct spdk_nvme_transport_id *trid); int bdev_nvme_create(struct spdk_nvme_transport_id *trid, struct spdk_nvme_host_id *hostid, diff --git a/module/bdev/nvme/bdev_nvme_rpc.c b/module/bdev/nvme/bdev_nvme_rpc.c index 2e05726c79c..4366fb4e471 100644 --- a/module/bdev/nvme/bdev_nvme_rpc.c +++ b/module/bdev/nvme/bdev_nvme_rpc.c @@ -257,6 +257,8 @@ rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request, struct spdk_nvme_transport_id trid = {}; struct spdk_nvme_host_id hostid = {}; uint32_t prchk_flags = 0; + struct nvme_bdev_ctrlr *ctrlr = NULL; + size_t len, maxlen; int rc; ctx = calloc(1, sizeof(*ctx)); @@ -287,8 +289,17 @@ rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request, rc = spdk_nvme_transport_id_parse_trtype(&trid.trtype, ctx->req.trtype); assert(rc == 0); + ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->req.name); + /* Parse traddr */ - snprintf(trid.traddr, sizeof(trid.traddr), "%s", ctx->req.traddr); + maxlen = sizeof(trid.traddr); + len = strnlen(ctx->req.traddr, maxlen); + if (len == maxlen) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "traddr too long: %s", + ctx->req.traddr); + goto cleanup; + } + memcpy(trid.traddr, ctx->req.traddr, len + 1); /* Parse adrfam */ if 
(ctx->req.adrfam) { @@ -303,7 +314,14 @@ rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request, /* Parse trsvcid */ if (ctx->req.trsvcid) { - snprintf(trid.trsvcid, sizeof(trid.trsvcid), "%s", ctx->req.trsvcid); + maxlen = sizeof(trid.trsvcid); + len = strnlen(ctx->req.trsvcid, maxlen); + if (len == maxlen) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "trsvcid too long: %s", + ctx->req.trsvcid); + goto cleanup; + } + memcpy(trid.trsvcid, ctx->req.trsvcid, len + 1); } /* Parse priority for the NVMe-oF transport connection */ @@ -313,15 +331,41 @@ rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request, /* Parse subnqn */ if (ctx->req.subnqn) { - snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", ctx->req.subnqn); + maxlen = sizeof(trid.subnqn); + len = strnlen(ctx->req.subnqn, maxlen); + if (len == maxlen) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "subnqn too long: %s", + ctx->req.subnqn); + goto cleanup; + } + memcpy(trid.subnqn, ctx->req.subnqn, len + 1); + } + + if (ctrlr && (ctx->req.hostaddr || ctx->req.hostnqn || ctx->req.hostsvcid || ctx->req.prchk_guard || + ctx->req.prchk_reftag)) { + goto conflicting_arguments; } if (ctx->req.hostaddr) { - snprintf(hostid.hostaddr, sizeof(hostid.hostaddr), "%s", ctx->req.hostaddr); + maxlen = sizeof(hostid.hostaddr); + len = strnlen(ctx->req.hostaddr, maxlen); + if (len == maxlen) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "hostaddr too long: %s", + ctx->req.hostaddr); + goto cleanup; + } + memcpy(hostid.hostaddr, ctx->req.hostaddr, len + 1); } if (ctx->req.hostsvcid) { - snprintf(hostid.hostsvcid, sizeof(hostid.hostsvcid), "%s", ctx->req.hostsvcid); + maxlen = sizeof(hostid.hostsvcid); + len = strnlen(ctx->req.hostsvcid, maxlen); + if (len == maxlen) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "hostsvcid too long: %s", + ctx->req.hostsvcid); + goto cleanup; + } + memcpy(hostid.hostsvcid, ctx->req.hostsvcid, len + 1); } if 
(ctx->req.prchk_reftag) { @@ -343,6 +387,9 @@ rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request, return; +conflicting_arguments: + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, + "Invalid agrgument list. Existing controller name cannot be combined with host information or PI options.\n"); cleanup: free_rpc_bdev_nvme_attach_controller(&ctx->req); free(ctx); @@ -357,7 +404,7 @@ rpc_dump_nvme_controller_info(struct spdk_json_write_ctx *w, { struct spdk_nvme_transport_id *trid; - trid = &nvme_bdev_ctrlr->trid; + trid = nvme_bdev_ctrlr->connected_trid; spdk_json_write_object_begin(w); spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); @@ -442,16 +489,31 @@ SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_get_controllers, get_nvme_controlle struct rpc_bdev_nvme_detach_controller { char *name; + char *trtype; + char *adrfam; + char *traddr; + char *trsvcid; + char *subnqn; }; static void free_rpc_bdev_nvme_detach_controller(struct rpc_bdev_nvme_detach_controller *req) { free(req->name); + free(req->trtype); + free(req->adrfam); + free(req->traddr); + free(req->trsvcid); + free(req->subnqn); } static const struct spdk_json_object_decoder rpc_bdev_nvme_detach_controller_decoders[] = { {"name", offsetof(struct rpc_bdev_nvme_detach_controller, name), spdk_json_decode_string}, + {"trtype", offsetof(struct rpc_bdev_nvme_detach_controller, trtype), spdk_json_decode_string, true}, + {"traddr", offsetof(struct rpc_bdev_nvme_detach_controller, traddr), spdk_json_decode_string, true}, + {"adrfam", offsetof(struct rpc_bdev_nvme_detach_controller, adrfam), spdk_json_decode_string, true}, + {"trsvcid", offsetof(struct rpc_bdev_nvme_detach_controller, trsvcid), spdk_json_decode_string, true}, + {"subnqn", offsetof(struct rpc_bdev_nvme_detach_controller, subnqn), spdk_json_decode_string, true}, }; static void @@ -459,8 +521,11 @@ rpc_bdev_nvme_detach_controller(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) { struct 
rpc_bdev_nvme_detach_controller req = {NULL}; + struct spdk_nvme_transport_id trid = {}; struct spdk_json_write_ctx *w; + size_t len, maxlen; int rc = 0; + bool all_trid_entries, one_trid_entry; if (spdk_json_decode_object(params, rpc_bdev_nvme_detach_controller_decoders, SPDK_COUNTOF(rpc_bdev_nvme_detach_controller_decoders), @@ -470,7 +535,65 @@ rpc_bdev_nvme_detach_controller(struct spdk_jsonrpc_request *request, goto cleanup; } - rc = bdev_nvme_delete(req.name); + all_trid_entries = req.trtype && req.traddr && req.adrfam && req.trsvcid && req.subnqn; + one_trid_entry = req.trtype || req.traddr || req.adrfam || req.trsvcid || req.subnqn; + + if (all_trid_entries ^ one_trid_entry) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "trtype, traddr, adrfam, trsvcid, subnqn must all be provided together or not at all."); + goto cleanup; + } + + if (all_trid_entries) { + /* Parse trtype */ + rc = spdk_nvme_transport_id_parse_trtype(&trid.trtype, req.trtype); + if (rc < 0) { + SPDK_ERRLOG("Failed to parse trtype: %s\n", req.trtype); + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", + req.trtype); + goto cleanup; + } + + /* Parse traddr */ + maxlen = sizeof(trid.traddr); + len = strnlen(req.traddr, maxlen); + if (len == maxlen) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "traddr too long: %s", + req.traddr); + goto cleanup; + } + memcpy(trid.traddr, req.traddr, len + 1); + + rc = spdk_nvme_transport_id_parse_adrfam(&trid.adrfam, req.adrfam); + if (rc < 0) { + SPDK_ERRLOG("Failed to parse adrfam: %s\n", req.adrfam); + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse adrfam: %s", + req.adrfam); + goto cleanup; + } + + maxlen = sizeof(trid.trsvcid); + len = strnlen(req.trsvcid, maxlen); + if (len == maxlen) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "trsvcid too long: %s", + req.trsvcid); + goto cleanup; + } + memcpy(trid.trsvcid, req.trsvcid, len + 
1); + + maxlen = sizeof(trid.subnqn); + len = strnlen(req.subnqn, maxlen); + if (len == maxlen) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "subnqn too long: %s", + req.subnqn); + goto cleanup; + } + memcpy(trid.subnqn, req.subnqn, len + 1); + rc = bdev_nvme_remove_trid(req.name, &trid); + } else { + rc = bdev_nvme_delete(req.name); + } + if (rc != 0) { spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); goto cleanup; diff --git a/module/bdev/nvme/bdev_ocssd.c b/module/bdev/nvme/bdev_ocssd.c index ddbc240c72d..35f665f404b 100644 --- a/module/bdev/nvme/bdev_ocssd.c +++ b/module/bdev/nvme/bdev_ocssd.c @@ -942,7 +942,7 @@ bdev_ocssd_poll_mm(void *ctx) } } - return 0; + return SPDK_POLLER_BUSY; } void diff --git a/module/bdev/nvme/common.c b/module/bdev/nvme/common.c index 67ee495f0a5..571f2f8a76f 100644 --- a/module/bdev/nvme/common.c +++ b/module/bdev/nvme/common.c @@ -45,7 +45,7 @@ nvme_bdev_ctrlr_get(const struct spdk_nvme_transport_id *trid) struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { - if (spdk_nvme_transport_id_compare(trid, &nvme_bdev_ctrlr->trid) == 0) { + if (spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid) == 0) { return nvme_bdev_ctrlr; } } @@ -116,6 +116,7 @@ static void nvme_bdev_unregister_cb(void *io_device) { struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; + struct nvme_bdev_ctrlr_trid *trid, *tmp_trid; uint32_t i; pthread_mutex_lock(&g_bdev_nvme_mutex); @@ -127,6 +128,12 @@ nvme_bdev_unregister_cb(void *io_device) for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { free(nvme_bdev_ctrlr->namespaces[i]); } + + TAILQ_FOREACH_SAFE(trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) { + TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, trid, link); + free(trid); + } + free(nvme_bdev_ctrlr->namespaces); free(nvme_bdev_ctrlr); @@ -150,14 +157,14 @@ nvme_bdev_ctrlr_destruct(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) /* If we have already registered a poller, let that 
one take care of it. */ if (nvme_bdev_ctrlr->destruct_poller != NULL) { pthread_mutex_unlock(&g_bdev_nvme_mutex); - return 1; + return SPDK_POLLER_IDLE; } if (nvme_bdev_ctrlr->resetting) { nvme_bdev_ctrlr->destruct_poller = SPDK_POLLER_REGISTER((spdk_poller_fn)nvme_bdev_ctrlr_destruct, nvme_bdev_ctrlr, 1000); pthread_mutex_unlock(&g_bdev_nvme_mutex); - return 1; + return SPDK_POLLER_BUSY; } pthread_mutex_unlock(&g_bdev_nvme_mutex); @@ -172,7 +179,7 @@ nvme_bdev_ctrlr_destruct(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) } spdk_io_device_unregister(nvme_bdev_ctrlr, nvme_bdev_unregister_cb); - return 1; + return SPDK_POLLER_BUSY; } void diff --git a/module/bdev/nvme/common.h b/module/bdev/nvme/common.h index f5de3edb786..d2bc6adf364 100644 --- a/module/bdev/nvme/common.h +++ b/module/bdev/nvme/common.h @@ -69,37 +69,46 @@ struct nvme_bdev_ns { struct ocssd_bdev_ctrlr; +struct nvme_bdev_ctrlr_trid { + struct spdk_nvme_transport_id trid; + TAILQ_ENTRY(nvme_bdev_ctrlr_trid) link; +}; + struct nvme_bdev_ctrlr { /** * points to pinned, physically contiguous memory region; * contains 4KB IDENTIFY structure for controller which is * target for CONTROLLER IDENTIFY command during initialization */ - struct spdk_nvme_ctrlr *ctrlr; - struct spdk_nvme_transport_id trid; - char *name; - int ref; - bool resetting; - bool destruct; + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_transport_id *connected_trid; + char *name; + int ref; + bool resetting; + bool failover_in_progress; + bool destruct; /** * PI check flags. This flags is set to NVMe controllers created only * through bdev_nvme_attach_controller RPC or .INI config file. Hot added * NVMe controllers are not included. 
*/ - uint32_t prchk_flags; - uint32_t num_ns; + uint32_t prchk_flags; + uint32_t num_ns; /** Array of pointers to namespaces indexed by nsid - 1 */ - struct nvme_bdev_ns **namespaces; + struct nvme_bdev_ns **namespaces; - struct spdk_opal_dev *opal_dev; + struct spdk_opal_dev *opal_dev; - struct spdk_poller *adminq_timer_poller; - struct spdk_poller *destruct_poller; + struct spdk_poller *adminq_timer_poller; + struct spdk_poller *destruct_poller; + struct spdk_thread *thread; - struct ocssd_bdev_ctrlr *ocssd_ctrlr; + struct ocssd_bdev_ctrlr *ocssd_ctrlr; /** linked list pointer for device list */ - TAILQ_ENTRY(nvme_bdev_ctrlr) tailq; + TAILQ_ENTRY(nvme_bdev_ctrlr) tailq; + + TAILQ_HEAD(, nvme_bdev_ctrlr_trid) trids; }; struct nvme_bdev { diff --git a/module/bdev/nvme/vbdev_opal.c b/module/bdev/nvme/vbdev_opal.c index cdbcc558823..68281c92b79 100644 --- a/module/bdev/nvme/vbdev_opal.c +++ b/module/bdev/nvme/vbdev_opal.c @@ -40,21 +40,16 @@ /* OPAL locking range only supports operations on nsid=1 for now */ #define NSID_SUPPORTED 1 -struct spdk_vbdev_opal_config { - char *nvme_ctrlr_name; - uint8_t locking_range_id; - uint64_t range_start; - uint64_t range_length; - SPDK_BDEV_PART_TAILQ part_tailq; - struct vbdev_opal_part_base *opal_base; -}; - struct opal_vbdev { char *name; struct nvme_bdev_ctrlr *nvme_ctrlr; struct spdk_opal_dev *opal_dev; struct spdk_bdev_part *bdev_part; - struct spdk_vbdev_opal_config cfg; + + uint8_t locking_range_id; + uint64_t range_start; + uint64_t range_length; + struct vbdev_opal_part_base *opal_base; TAILQ_ENTRY(opal_vbdev) tailq; }; @@ -75,7 +70,7 @@ struct vbdev_opal_channel { struct vbdev_opal_part_base { char *nvme_ctrlr_name; struct spdk_bdev_part_base *part_base; - int num_of_part; + SPDK_BDEV_PART_TAILQ part_tailq; TAILQ_ENTRY(vbdev_opal_part_base) tailq; }; @@ -90,7 +85,6 @@ vbdev_opal_delete(struct opal_vbdev *opal_bdev) { TAILQ_REMOVE(&g_opal_vbdev, opal_bdev, tailq); free(opal_bdev->name); - 
free(opal_bdev->cfg.nvme_ctrlr_name); free(opal_bdev); opal_bdev = NULL; } @@ -151,6 +145,8 @@ vbdev_opal_base_free(void *ctx) { struct vbdev_opal_part_base *base = ctx; + TAILQ_REMOVE(&g_opal_base, base, tailq); + free(base->nvme_ctrlr_name); free(base); } @@ -252,7 +248,7 @@ vbdev_opal_get_info_from_bdev(const char *opal_bdev_name, const char *password) return NULL; } - locking_range_id = vbdev->cfg.locking_range_id; + locking_range_id = vbdev->locking_range_id; rc = spdk_opal_cmd_get_locking_range_info(nvme_ctrlr->opal_dev, password, OPAL_ADMIN1, locking_range_id); if (rc) { @@ -318,13 +314,6 @@ static struct spdk_bdev_module opal_if = { SPDK_BDEV_MODULE_REGISTER(opal, &opal_if) -static void -vbdev_opal_free_bdev(struct opal_vbdev *opal_bdev) -{ - free(opal_bdev->cfg.nvme_ctrlr_name); - free(opal_bdev); -} - int vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_range_id, uint64_t range_start, uint64_t range_length, const char *password) @@ -334,9 +323,8 @@ vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_ra char *base_bdev_name; struct nvme_bdev_ctrlr *nvme_ctrlr; struct opal_vbdev *opal_bdev; - struct vbdev_opal_part_base *opal_part_base; + struct vbdev_opal_part_base *opal_part_base = NULL; struct spdk_bdev_part *part_bdev; - struct spdk_vbdev_opal_config *cfg; struct nvme_bdev *nvme_bdev; if (nsid != NSID_SUPPORTED) { @@ -361,17 +349,9 @@ vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_ra return -ENOMEM; } - cfg = &opal_bdev->cfg; - cfg->nvme_ctrlr_name = strdup(nvme_ctrlr_name); - if (!cfg->nvme_ctrlr_name) { - SPDK_ERRLOG("allocation for nvme_ctrlr_name failed\n"); - free(opal_bdev); - return -ENOMEM; - } - - cfg->locking_range_id = locking_range_id; - cfg->range_start = range_start; - cfg->range_length = range_length; + opal_bdev->locking_range_id = locking_range_id; + opal_bdev->range_start = range_start; + opal_bdev->range_length = range_length; opal_bdev->nvme_ctrlr = 
nvme_ctrlr; opal_bdev->opal_dev = nvme_ctrlr->opal_dev; @@ -383,53 +363,53 @@ vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_ra /* traverse base list to see if part_base is already create for this base bdev */ TAILQ_FOREACH(opal_part_base, &g_opal_base, tailq) { if (!strcmp(spdk_bdev_part_base_get_bdev_name(opal_part_base->part_base), base_bdev_name)) { - cfg->opal_base = opal_part_base; + break; } } /* If there is not a corresponding opal_part_base, a new opal_part_base will be created. For each new part_base, there will be one tailq to store all the parts of this base */ - if (cfg->opal_base == NULL) { - TAILQ_INIT(&cfg->part_tailq); + if (opal_part_base == NULL) { opal_part_base = calloc(1, sizeof(*opal_part_base)); if (opal_part_base == NULL) { SPDK_ERRLOG("Could not allocate opal_part_base\n"); - vbdev_opal_free_bdev(opal_bdev); + free(opal_bdev); return -ENOMEM; } + TAILQ_INIT(&opal_part_base->part_tailq); opal_part_base->part_base = spdk_bdev_part_base_construct(spdk_bdev_get_by_name(base_bdev_name), vbdev_opal_base_bdev_hotremove_cb, &opal_if, - &opal_vbdev_fn_table, &cfg->part_tailq, vbdev_opal_base_free, opal_part_base, - sizeof(struct vbdev_opal_channel), NULL, NULL); + &opal_vbdev_fn_table, &opal_part_base->part_tailq, vbdev_opal_base_free, + opal_part_base, sizeof(struct vbdev_opal_channel), NULL, NULL); if (opal_part_base->part_base == NULL) { SPDK_ERRLOG("Could not allocate part_base\n"); - vbdev_opal_free_bdev(opal_bdev); + free(opal_bdev); free(opal_part_base); return -ENOMEM; } - opal_part_base->num_of_part = 0; - opal_part_base->nvme_ctrlr_name = strdup(cfg->nvme_ctrlr_name); + opal_part_base->nvme_ctrlr_name = strdup(nvme_ctrlr_name); if (opal_part_base->nvme_ctrlr_name == NULL) { - vbdev_opal_free_bdev(opal_bdev); + free(opal_bdev); spdk_bdev_part_base_free(opal_part_base->part_base); return -ENOMEM; } - cfg->opal_base = opal_part_base; TAILQ_INSERT_TAIL(&g_opal_base, opal_part_base, tailq); } + 
assert(opal_part_base != NULL); + opal_bdev->opal_base = opal_part_base; part_bdev = calloc(1, sizeof(struct spdk_bdev_part)); if (!part_bdev) { SPDK_ERRLOG("Could not allocate part_bdev\n"); - vbdev_opal_free_bdev(opal_bdev); + free(opal_bdev); return -ENOMEM; } TAILQ_INSERT_TAIL(&g_opal_vbdev, opal_bdev, tailq); opal_vbdev_name = spdk_sprintf_alloc("%sr%" PRIu8, base_bdev_name, - cfg->locking_range_id); /* e.g.: nvme0n1r1 */ + opal_bdev->locking_range_id); /* e.g.: nvme0n1r1 */ if (opal_vbdev_name == NULL) { SPDK_ERRLOG("Could not allocate opal_vbdev_name\n"); rc = -ENOMEM; @@ -438,14 +418,15 @@ vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_ra opal_bdev->name = opal_vbdev_name; rc = spdk_opal_cmd_setup_locking_range(opal_bdev->opal_dev, OPAL_ADMIN1, - cfg->locking_range_id, cfg->range_start, cfg->range_length, password); + opal_bdev->locking_range_id, opal_bdev->range_start, + opal_bdev->range_length, password); if (rc) { SPDK_ERRLOG("Error construct %s\n", opal_vbdev_name); goto err; } - rc = spdk_bdev_part_construct(part_bdev, cfg->opal_base->part_base, opal_vbdev_name, - cfg->range_start, cfg->range_length, "Opal locking range"); + rc = spdk_bdev_part_construct(part_bdev, opal_bdev->opal_base->part_base, opal_vbdev_name, + opal_bdev->range_start, opal_bdev->range_length, "Opal locking range"); if (rc) { SPDK_ERRLOG("Could not allocate bdev part\n"); goto err; @@ -460,7 +441,6 @@ vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_ra } opal_bdev->bdev_part = part_bdev; - cfg->opal_base->num_of_part++; return 0; err: @@ -472,25 +452,13 @@ vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_ra static void vbdev_opal_destruct_bdev(struct opal_vbdev *opal_bdev) { - SPDK_BDEV_PART_TAILQ *opal_part_tailq; - struct spdk_bdev_part *part; - struct spdk_vbdev_opal_config *cfg = &opal_bdev->cfg; - - if (cfg->opal_base != NULL) { - part = opal_bdev->bdev_part; - opal_part_tailq = 
spdk_bdev_part_base_get_tailq(cfg->opal_base->part_base); - if (cfg->range_start == spdk_bdev_part_get_offset_blocks(part)) { - if (cfg->opal_base->num_of_part <= 1) { - /* if there is only one part for this base, we can remove the base now */ - spdk_bdev_part_base_hotremove(cfg->opal_base->part_base, opal_part_tailq); - - /* remove from the tailq vbdev_opal_part_base */ - TAILQ_REMOVE(&g_opal_base, cfg->opal_base, tailq); - } else { - spdk_bdev_unregister(spdk_bdev_part_get_bdev(part), NULL, NULL); - cfg->opal_base->num_of_part--; - } - } + struct spdk_bdev_part *part = opal_bdev->bdev_part; + + assert(opal_bdev->opal_base != NULL); + assert(part != NULL); + + if (opal_bdev->range_start == spdk_bdev_part_get_offset_blocks(part)) { + spdk_bdev_unregister(spdk_bdev_part_get_bdev(part), NULL, NULL); } vbdev_opal_delete(opal_bdev); } @@ -498,7 +466,6 @@ vbdev_opal_destruct_bdev(struct opal_vbdev *opal_bdev) int vbdev_opal_destruct(const char *bdev_name, const char *password) { - struct spdk_vbdev_opal_config *cfg; struct nvme_bdev_ctrlr *nvme_ctrlr; int locking_range_id; int rc; @@ -516,8 +483,7 @@ vbdev_opal_destruct(const char *bdev_name, const char *password) goto err; } - cfg = &opal_bdev->cfg; - locking_range_id = cfg->locking_range_id; + locking_range_id = opal_bdev->locking_range_id; nvme_ctrlr = opal_bdev->nvme_ctrlr; if (nvme_ctrlr == NULL) { @@ -594,7 +560,7 @@ vbdev_opal_set_lock_state(const char *bdev_name, uint16_t user_id, const char *p return -EINVAL; } - locking_range_id = opal_bdev->cfg.locking_range_id; + locking_range_id = opal_bdev->locking_range_id; rc = spdk_opal_cmd_lock_unlock(nvme_ctrlr->opal_dev, user_id, state_flag, locking_range_id, password); if (rc) { @@ -643,7 +609,7 @@ vbdev_opal_enable_new_user(const char *bdev_name, const char *admin_password, ui return rc; } - locking_range_id = opal_bdev->cfg.locking_range_id; + locking_range_id = opal_bdev->locking_range_id; rc = spdk_opal_cmd_add_user_to_locking_range(nvme_ctrlr->opal_dev, 
user_id, locking_range_id, OPAL_READONLY, admin_password); if (rc) { diff --git a/module/bdev/ocf/ctx.c b/module/bdev/ocf/ctx.c index 3bcb422ad43..5bf4c8fee72 100644 --- a/module/bdev/ocf/ctx.c +++ b/module/bdev/ocf/ctx.c @@ -342,13 +342,13 @@ cleaner_poll(void *arg) if (spdk_get_ticks() >= priv->next_run) { ocf_cleaner_run(cleaner, priv->queue); - return 1; + return SPDK_POLLER_BUSY; } if (iono > 0) { - return 1; + return SPDK_POLLER_BUSY; } else { - return 0; + return SPDK_POLLER_IDLE; } } diff --git a/module/bdev/ocf/vbdev_ocf.c b/module/bdev/ocf/vbdev_ocf.c index 4a714b94180..2a32dca8f81 100644 --- a/module/bdev/ocf/vbdev_ocf.c +++ b/module/bdev/ocf/vbdev_ocf.c @@ -135,6 +135,15 @@ get_other_cache_base(struct vbdev_ocf_base *base) return NULL; } +static bool +is_ocf_cache_running(struct vbdev_ocf *vbdev) +{ + if (vbdev->cache.attached && vbdev->ocf_cache) { + return ocf_cache_is_running(vbdev->ocf_cache); + } + return false; +} + /* Get existing OCF cache instance * that is started by other vbdev */ static ocf_cache_t @@ -149,7 +158,7 @@ get_other_cache_instance(struct vbdev_ocf *vbdev) if (strcmp(cmp->cache.name, vbdev->cache.name)) { continue; } - if (cmp->ocf_cache) { + if (is_ocf_cache_running(cmp)) { return cmp->ocf_cache; } } @@ -190,6 +199,7 @@ static void unregister_finish(struct vbdev_ocf *vbdev) { spdk_bdev_destruct_done(&vbdev->exp_bdev, vbdev->state.stop_status); + ocf_mngt_cache_put(vbdev->ocf_cache); vbdev_ocf_cache_ctx_put(vbdev->cache_ctx); vbdev_ocf_mngt_continue(vbdev, 0); } @@ -230,7 +240,7 @@ remove_core_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) static void detach_core(struct vbdev_ocf *vbdev) { - if (vbdev->ocf_cache && ocf_cache_is_running(vbdev->ocf_cache)) { + if (is_ocf_cache_running(vbdev)) { ocf_mngt_cache_lock(vbdev->ocf_cache, remove_core_cache_lock_cmpl, vbdev); } else { vbdev_ocf_mngt_continue(vbdev, 0); @@ -291,7 +301,7 @@ stop_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) static void 
stop_vbdev(struct vbdev_ocf *vbdev) { - if (!ocf_cache_is_running(vbdev->ocf_cache)) { + if (!is_ocf_cache_running(vbdev)) { vbdev_ocf_mngt_continue(vbdev, 0); return; } @@ -334,7 +344,7 @@ flush_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) static void flush_vbdev(struct vbdev_ocf *vbdev) { - if (!ocf_cache_is_running(vbdev->ocf_cache)) { + if (!is_ocf_cache_running(vbdev)) { vbdev_ocf_mngt_continue(vbdev, -EINVAL); return; } @@ -751,6 +761,8 @@ vbdev_ocf_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx * spdk_json_write_named_string(w, "name", vbdev->name); spdk_json_write_named_string(w, "mode", ocf_get_cache_modename(ocf_cache_get_mode(vbdev->ocf_cache))); + spdk_json_write_named_uint32(w, "cache_line_size", + ocf_cache_get_line_size(vbdev->ocf_cache)); spdk_json_write_named_string(w, "cache_bdev_name", vbdev->cache.name); spdk_json_write_named_string(w, "core_bdev_name", vbdev->core.name); spdk_json_write_object_end(w); @@ -783,9 +795,9 @@ queue_poll(void *opaque) } if (iono > 0) { - return 1; + return SPDK_POLLER_BUSY; } else { - return 0; + return SPDK_POLLER_IDLE; } } @@ -891,9 +903,9 @@ mngt_queue_poll(void *opaque) } if (iono > 0) { - return 1; + return SPDK_POLLER_BUSY; } else { - return 0; + return SPDK_POLLER_IDLE; } } @@ -1040,7 +1052,7 @@ start_cache(struct vbdev_ocf *vbdev) ocf_cache_t existing; int rc; - if (vbdev->ocf_cache) { + if (is_ocf_cache_running(vbdev)) { vbdev_ocf_mngt_stop(vbdev, NULL, -EALREADY); return; } @@ -1050,6 +1062,7 @@ start_cache(struct vbdev_ocf *vbdev) SPDK_NOTICELOG("OCF bdev %s connects to existing cache device %s\n", vbdev->name, vbdev->cache.name); vbdev->ocf_cache = existing; + ocf_mngt_cache_get(vbdev->ocf_cache); vbdev->cache_ctx = ocf_cache_get_priv(existing); vbdev_ocf_cache_ctx_get(vbdev->cache_ctx); vbdev_ocf_mngt_continue(vbdev, 0); @@ -1070,6 +1083,7 @@ start_cache(struct vbdev_ocf *vbdev) vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, rc); return; } + 
ocf_mngt_cache_get(vbdev->ocf_cache); ocf_cache_set_priv(vbdev->ocf_cache, vbdev->cache_ctx); @@ -1127,17 +1141,11 @@ init_vbdev_config(struct vbdev_ocf *vbdev) * metadata support */ cfg->cache.metadata_volatile = false; - /* TODO [cache line size]: make cache line size configurable - * Using standard 4KiB for now */ - cfg->cache.cache_line_size = ocf_cache_line_size_4; - /* This are suggested values that * should be sufficient for most use cases */ cfg->cache.backfill.max_queue_size = 65536; cfg->cache.backfill.queue_unblock_size = 60000; - /* TODO [cache line size] */ - cfg->device.cache_line_size = ocf_cache_line_size_4; cfg->device.force = true; cfg->device.perform_test = false; cfg->device.discard_on_start = false; @@ -1172,6 +1180,7 @@ init_vbdev_config(struct vbdev_ocf *vbdev) static int init_vbdev(const char *vbdev_name, const char *cache_mode_name, + const uint64_t cache_line_size, const char *cache_name, const char *core_name, bool loadq) @@ -1208,6 +1217,17 @@ init_vbdev(const char *vbdev_name, goto error_free; } + ocf_cache_line_size_t set_cache_line_size = cache_line_size ? 
+ (ocf_cache_line_size_t)cache_line_size * KiB : + ocf_cache_line_size_default; + if (set_cache_line_size == 0) { + SPDK_ERRLOG("Cache line size should be non-zero.\n"); + rc = -EINVAL; + goto error_free; + } + vbdev->cfg.device.cache_line_size = set_cache_line_size; + vbdev->cfg.cache.cache_line_size = set_cache_line_size; + vbdev->name = strdup(vbdev_name); if (!vbdev->name) { goto error_mem; @@ -1240,9 +1260,10 @@ init_vbdev(const char *vbdev_name, static int vbdev_ocf_init(void) { - const char *vbdev_name, *modename, *cache_name, *core_name; + const char *vbdev_name, *modename, *cache_line_size, *cache_name, *core_name; struct spdk_conf_section *sp; int status; + uint64_t cache_line_size_uint64; status = vbdev_ocf_ctx_init(); if (status) { @@ -1279,19 +1300,26 @@ vbdev_ocf_init(void) continue; } - cache_name = spdk_conf_section_get_nmval(sp, "OCF", i, 2); + cache_line_size = spdk_conf_section_get_nmval(sp, "OCF", i, 2); + if (!cache_line_size) { + SPDK_ERRLOG("No cache line size specified for OCF vbdev '%s'\n", vbdev_name); + continue; + } + cache_line_size_uint64 = strtoull(cache_line_size, NULL, 10); + + cache_name = spdk_conf_section_get_nmval(sp, "OCF", i, 3); if (!cache_name) { SPDK_ERRLOG("No cache device specified for OCF vbdev '%s'\n", vbdev_name); continue; } - core_name = spdk_conf_section_get_nmval(sp, "OCF", i, 3); + core_name = spdk_conf_section_get_nmval(sp, "OCF", i, 4); if (!core_name) { SPDK_ERRLOG("No core devices specified for OCF vbdev '%s'\n", vbdev_name); continue; } - status = init_vbdev(vbdev_name, modename, cache_name, core_name, false); + status = init_vbdev(vbdev_name, modename, cache_line_size_uint64, cache_name, core_name, false); if (status) { SPDK_ERRLOG("Config initialization failed with code: %d\n", status); } @@ -1425,6 +1453,7 @@ attach_base_bdevs(struct vbdev_ocf *vbdev, void vbdev_ocf_construct(const char *vbdev_name, const char *cache_mode_name, + const uint64_t cache_line_size, const char *cache_name, const char 
*core_name, bool loadq, @@ -1436,7 +1465,7 @@ vbdev_ocf_construct(const char *vbdev_name, struct spdk_bdev *core_bdev = spdk_bdev_get_by_name(core_name); struct vbdev_ocf *vbdev; - rc = init_vbdev(vbdev_name, cache_mode_name, cache_name, core_name, loadq); + rc = init_vbdev(vbdev_name, cache_mode_name, cache_line_size, cache_name, core_name, loadq); if (rc) { cb(rc, NULL, cb_arg); return; @@ -1590,7 +1619,7 @@ metadata_probe_cores_construct(void *priv, int error, unsigned int num_cores) } ctx->refcnt++; - vbdev_ocf_construct(vbdev_name, NULL, cache_name, core_name, true, + vbdev_ocf_construct(vbdev_name, NULL, 0, cache_name, core_name, true, metadata_probe_construct_cb, ctx); } diff --git a/module/bdev/ocf/vbdev_ocf.h b/module/bdev/ocf/vbdev_ocf.h index d0fd0b183d4..b313e9e0ca3 100644 --- a/module/bdev/ocf/vbdev_ocf.h +++ b/module/bdev/ocf/vbdev_ocf.h @@ -185,6 +185,7 @@ struct vbdev_ocf { void vbdev_ocf_construct( const char *vbdev_name, const char *cache_mode_name, + const uint64_t cache_line_size, const char *cache_name, const char *core_name, bool loadq, diff --git a/module/bdev/ocf/vbdev_ocf_rpc.c b/module/bdev/ocf/vbdev_ocf_rpc.c index 89286fe2300..8cba6509349 100644 --- a/module/bdev/ocf/vbdev_ocf_rpc.c +++ b/module/bdev/ocf/vbdev_ocf_rpc.c @@ -39,10 +39,11 @@ /* Structure to hold the parameters for this RPC method. 
*/ struct rpc_bdev_ocf_create { - char *name; /* master vbdev */ - char *mode; /* OCF mode (choose one) */ - char *cache_bdev_name; /* sub bdev */ - char *core_bdev_name; /* sub bdev */ + char *name; /* master vbdev */ + char *mode; /* OCF mode (choose one) */ + uint64_t cache_line_size; /* OCF cache line size */ + char *cache_bdev_name; /* sub bdev */ + char *core_bdev_name; /* sub bdev */ }; static void @@ -58,6 +59,7 @@ free_rpc_bdev_ocf_create(struct rpc_bdev_ocf_create *r) static const struct spdk_json_object_decoder rpc_bdev_ocf_create_decoders[] = { {"name", offsetof(struct rpc_bdev_ocf_create, name), spdk_json_decode_string}, {"mode", offsetof(struct rpc_bdev_ocf_create, mode), spdk_json_decode_string}, + {"cache_line_size", offsetof(struct rpc_bdev_ocf_create, cache_line_size), spdk_json_decode_uint64, true}, {"cache_bdev_name", offsetof(struct rpc_bdev_ocf_create, cache_bdev_name), spdk_json_decode_string}, {"core_bdev_name", offsetof(struct rpc_bdev_ocf_create, core_bdev_name), spdk_json_decode_string}, }; @@ -96,8 +98,8 @@ rpc_bdev_ocf_create(struct spdk_jsonrpc_request *request, return; } - vbdev_ocf_construct(req.name, req.mode, req.cache_bdev_name, req.core_bdev_name, false, - construct_cb, request); + vbdev_ocf_construct(req.name, req.mode, req.cache_line_size, req.cache_bdev_name, + req.core_bdev_name, false, construct_cb, request); free_rpc_bdev_ocf_create(&req); } SPDK_RPC_REGISTER("bdev_ocf_create", rpc_bdev_ocf_create, SPDK_RPC_RUNTIME) diff --git a/module/bdev/raid/bdev_raid.c b/module/bdev/raid/bdev_raid.c index 10da1a799e1..f3fdfb182ad 100644 --- a/module/bdev/raid/bdev_raid.c +++ b/module/bdev/raid/bdev_raid.c @@ -665,7 +665,7 @@ raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx * spdk_json_write_named_object_begin(w, "params"); spdk_json_write_named_string(w, "name", bdev->name); - spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size_kb); + spdk_json_write_named_uint32(w, "strip_size_kb", 
raid_bdev->strip_size_kb); spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); spdk_json_write_named_array_begin(w, "base_bdevs"); diff --git a/module/bdev/rbd/bdev_rbd.c b/module/bdev/rbd/bdev_rbd.c index 01fad34854f..eb9e25c2ad9 100644 --- a/module/bdev/rbd/bdev_rbd.c +++ b/module/bdev/rbd/bdev_rbd.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "spdk/conf.h" #include "spdk/env.h" @@ -46,16 +47,16 @@ #include "spdk/json.h" #include "spdk/string.h" #include "spdk/util.h" +#include "spdk/likely.h" #include "spdk/bdev_module.h" #include "spdk_internal/log.h" #define SPDK_RBD_QUEUE_DEPTH 128 +#define MAX_EVENTS_PER_POLL 128 static int bdev_rbd_count = 0; -#define BDEV_RBD_POLL_US 50 - struct bdev_rbd { struct spdk_bdev disk; char *rbd_name; @@ -68,19 +69,22 @@ struct bdev_rbd { struct spdk_bdev_io *reset_bdev_io; }; +struct bdev_rbd_group_channel { + struct spdk_poller *poller; + int epoll_fd; +}; + struct bdev_rbd_io_channel { rados_ioctx_t io_ctx; rados_t cluster; - struct pollfd pfd; + int pfd; rbd_image_t image; struct bdev_rbd *disk; - struct spdk_poller *poller; + struct bdev_rbd_group_channel *group_ch; }; struct bdev_rbd_io { - uint64_t remaining_len; - int num_segments; - bool failed; + size_t total_len; }; static void @@ -234,11 +238,14 @@ bdev_rbd_finish_aiocb(rbd_completion_t cb, void *arg) } static int -bdev_rbd_start_aio(rbd_image_t image, struct spdk_bdev_io *bdev_io, - void *buf, uint64_t offset, size_t len) +bdev_rbd_start_aio(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + struct iovec *iov, int iovcnt, uint64_t offset, size_t len) { + struct bdev_rbd_io_channel *rbdio_ch = spdk_io_channel_get_ctx(ch); int ret; rbd_completion_t comp; + struct bdev_rbd_io *rbd_io; + rbd_image_t image = rbdio_ch->image; ret = rbd_aio_create_completion(bdev_io, bdev_rbd_finish_aiocb, &comp); @@ -247,11 +254,19 @@ bdev_rbd_start_aio(rbd_image_t image, struct spdk_bdev_io *bdev_io, } if (bdev_io->type == 
SPDK_BDEV_IO_TYPE_READ) { - ret = rbd_aio_read(image, offset, len, - buf, comp); + rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; + rbd_io->total_len = len; + if (spdk_likely(iovcnt == 1)) { + ret = rbd_aio_read(image, offset, iov[0].iov_len, iov[0].iov_base, comp); + } else { + ret = rbd_aio_readv(image, iov, iovcnt, offset, comp); + } } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { - ret = rbd_aio_write(image, offset, len, - buf, comp); + if (spdk_likely(iovcnt == 1)) { + ret = rbd_aio_write(image, offset, iov[0].iov_len, iov[0].iov_base, comp); + } else { + ret = rbd_aio_writev(image, iov, iovcnt, offset, comp); + } } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) { ret = rbd_aio_flush(image, comp); } @@ -266,6 +281,8 @@ bdev_rbd_start_aio(rbd_image_t image, struct spdk_bdev_io *bdev_io, static int bdev_rbd_library_init(void); +static void bdev_rbd_library_fini(void); + static int bdev_rbd_get_ctx_size(void) { @@ -275,65 +292,12 @@ bdev_rbd_get_ctx_size(void) static struct spdk_bdev_module rbd_if = { .name = "rbd", .module_init = bdev_rbd_library_init, + .module_fini = bdev_rbd_library_fini, .get_ctx_size = bdev_rbd_get_ctx_size, }; SPDK_BDEV_MODULE_REGISTER(rbd, &rbd_if) -static int64_t -bdev_rbd_rw(struct bdev_rbd *disk, struct spdk_io_channel *ch, - struct spdk_bdev_io *bdev_io, struct iovec *iov, - int iovcnt, size_t len, uint64_t offset) -{ - struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; - struct bdev_rbd_io_channel *rbdio_ch = spdk_io_channel_get_ctx(ch); - size_t remaining = len; - int i, rc; - - rbd_io->remaining_len = 0; - rbd_io->num_segments = 0; - rbd_io->failed = false; - - for (i = 0; i < iovcnt && remaining > 0; i++) { - size_t seg_len = spdk_min(remaining, iov[i].iov_len); - - rc = bdev_rbd_start_aio(rbdio_ch->image, bdev_io, iov[i].iov_base, offset, seg_len); - if (rc) { - /* - * This bdev_rbd_start_aio() call failed, but if any previous ones were - * submitted, we need to wait for them to finish. 
- */ - if (rbd_io->num_segments == 0) { - /* No previous I/O submitted - return error code immediately. */ - return rc; - } - - /* Return and wait for outstanding I/O to complete. */ - rbd_io->failed = true; - return 0; - } - - rbd_io->num_segments++; - rbd_io->remaining_len += seg_len; - - offset += seg_len; - remaining -= seg_len; - } - - return 0; -} - -static int64_t -bdev_rbd_flush(struct bdev_rbd *disk, struct spdk_io_channel *ch, - struct spdk_bdev_io *bdev_io, uint64_t offset, uint64_t nbytes) -{ - struct bdev_rbd_io_channel *rbdio_ch = spdk_io_channel_get_ctx(ch); - struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; - - rbd_io->num_segments++; - return bdev_rbd_start_aio(rbdio_ch->image, bdev_io, NULL, offset, nbytes); -} - static int bdev_rbd_reset_timer(void *arg) { @@ -347,7 +311,7 @@ bdev_rbd_reset_timer(void *arg) spdk_poller_unregister(&disk->reset_timer); disk->reset_bdev_io = NULL; - return -1; + return SPDK_POLLER_BUSY; } static int @@ -386,13 +350,12 @@ bdev_rbd_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, return; } - ret = bdev_rbd_rw(bdev_io->bdev->ctxt, - ch, - bdev_io, - bdev_io->u.bdev.iovs, - bdev_io->u.bdev.iovcnt, - bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, - bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + ret = bdev_rbd_start_aio(ch, + bdev_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); if (ret != 0) { spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); @@ -408,20 +371,13 @@ static int _bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev return 0; case SPDK_BDEV_IO_TYPE_WRITE: - return bdev_rbd_rw((struct bdev_rbd *)bdev_io->bdev->ctxt, - ch, - bdev_io, - bdev_io->u.bdev.iovs, - bdev_io->u.bdev.iovcnt, - bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, - bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); - case 
SPDK_BDEV_IO_TYPE_FLUSH: - return bdev_rbd_flush((struct bdev_rbd *)bdev_io->bdev->ctxt, - ch, - bdev_io, - bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen, - bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return bdev_rbd_start_aio(ch, + bdev_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); case SPDK_BDEV_IO_TYPE_RESET: return bdev_rbd_reset((struct bdev_rbd *)bdev_io->bdev->ctxt, @@ -455,56 +411,37 @@ bdev_rbd_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) } } -static int -bdev_rbd_io_poll(void *arg) +static void +bdev_rbd_io_poll(struct bdev_rbd_io_channel *ch) { - struct bdev_rbd_io_channel *ch = arg; int i, io_status, rc; rbd_completion_t comps[SPDK_RBD_QUEUE_DEPTH]; struct spdk_bdev_io *bdev_io; struct bdev_rbd_io *rbd_io; - - rc = poll(&ch->pfd, 1, 0); - - /* check the return value of poll since we have only one fd for each channel */ - if (rc != 1) { - return 0; - } + enum spdk_bdev_io_status bio_status; rc = rbd_poll_io_events(ch->image, comps, SPDK_RBD_QUEUE_DEPTH); for (i = 0; i < rc; i++) { bdev_io = rbd_aio_get_arg(comps[i]); rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; io_status = rbd_aio_get_return_value(comps[i]); - - assert(rbd_io->num_segments > 0); - rbd_io->num_segments--; + bio_status = SPDK_BDEV_IO_STATUS_SUCCESS; if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { - if (io_status > 0) { - /* For reads, io_status is the length */ - rbd_io->remaining_len -= io_status; - } - - if (rbd_io->num_segments == 0 && rbd_io->remaining_len != 0) { - rbd_io->failed = true; + if ((int)rbd_io->total_len != io_status) { + bio_status = SPDK_BDEV_IO_STATUS_FAILED; } } else { /* For others, 0 means success */ if (io_status != 0) { - rbd_io->failed = true; + bio_status = SPDK_BDEV_IO_STATUS_FAILED; } } rbd_aio_release(comps[i]); - if (rbd_io->num_segments == 0) { - spdk_bdev_io_complete(bdev_io, - 
rbd_io->failed ? SPDK_BDEV_IO_STATUS_FAILED : SPDK_BDEV_IO_STATUS_SUCCESS); - } + spdk_bdev_io_complete(bdev_io, bio_status); } - - return rc; } static void @@ -526,8 +463,12 @@ bdev_rbd_free_channel(struct bdev_rbd_io_channel *ch) rados_shutdown(ch->cluster); } - if (ch->pfd.fd >= 0) { - close(ch->pfd.fd); + if (ch->pfd >= 0) { + close(ch->pfd); + } + + if (ch->group_ch) { + spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); } } @@ -536,12 +477,24 @@ bdev_rbd_handle(void *arg) { struct bdev_rbd_io_channel *ch = arg; void *ret = arg; + int rc; + + rc = bdev_rados_context_init(ch->disk->user_id, ch->disk->pool_name, + (const char *const *)ch->disk->config, + &ch->cluster, &ch->io_ctx); + if (rc < 0) { + SPDK_ERRLOG("Failed to create rados context for user_id %s and rbd_pool=%s\n", + ch->disk->user_id ? ch->disk->user_id : "admin (the default)", ch->disk->pool_name); + ret = NULL; + goto end; + } if (rbd_open(ch->io_ctx, ch->disk->rbd_name, &ch->image, NULL) < 0) { SPDK_ERRLOG("Failed to open specified rbd device\n"); ret = NULL; } +end: return ret; } @@ -550,39 +503,41 @@ bdev_rbd_create_cb(void *io_device, void *ctx_buf) { struct bdev_rbd_io_channel *ch = ctx_buf; int ret; + struct epoll_event event; ch->disk = io_device; ch->image = NULL; ch->io_ctx = NULL; - ch->pfd.fd = -1; - - ret = bdev_rados_context_init(ch->disk->user_id, ch->disk->pool_name, - (const char *const *)ch->disk->config, - &ch->cluster, &ch->io_ctx); - if (ret < 0) { - SPDK_ERRLOG("Failed to create rados context for user_id %s and rbd_pool=%s\n", - ch->disk->user_id ? 
ch->disk->user_id : "admin (the default)", ch->disk->pool_name); - goto err; - } + ch->pfd = -1; if (spdk_call_unaffinitized(bdev_rbd_handle, ch) == NULL) { goto err; } - ch->pfd.fd = eventfd(0, EFD_NONBLOCK); - if (ch->pfd.fd < 0) { + ch->pfd = eventfd(0, EFD_NONBLOCK); + if (ch->pfd < 0) { SPDK_ERRLOG("Failed to get eventfd\n"); goto err; } - ch->pfd.events = POLLIN; - ret = rbd_set_image_notification(ch->image, ch->pfd.fd, EVENT_TYPE_EVENTFD); + ret = rbd_set_image_notification(ch->image, ch->pfd, EVENT_TYPE_EVENTFD); if (ret < 0) { SPDK_ERRLOG("Failed to set rbd image notification\n"); goto err; } - ch->poller = SPDK_POLLER_REGISTER(bdev_rbd_io_poll, ch, BDEV_RBD_POLL_US); + ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&rbd_if)); + assert(ch->group_ch != NULL); + memset(&event, 0, sizeof(event)); + event.events = EPOLLIN; + event.data.ptr = ch; + + ret = epoll_ctl(ch->group_ch->epoll_fd, EPOLL_CTL_ADD, ch->pfd, &event); + if (ret < 0) { + SPDK_ERRLOG("Failed to add the fd of ch(%p) to the epoll group from group_ch=%p\n", ch, + ch->group_ch); + goto err; + } return 0; @@ -595,10 +550,16 @@ static void bdev_rbd_destroy_cb(void *io_device, void *ctx_buf) { struct bdev_rbd_io_channel *io_channel = ctx_buf; + int rc; - bdev_rbd_free_channel(io_channel); + rc = epoll_ctl(io_channel->group_ch->epoll_fd, EPOLL_CTL_DEL, + io_channel->pfd, NULL); + if (rc < 0) { + SPDK_ERRLOG("Failed to remove fd on io_channel=%p from the polling group=%p\n", + io_channel, io_channel->group_ch); + } - spdk_poller_unregister(&io_channel->poller); + bdev_rbd_free_channel(io_channel); } static struct spdk_io_channel * @@ -823,6 +784,54 @@ bdev_rbd_resize(struct spdk_bdev *bdev, const uint64_t new_size_in_mb) return rc; } +static int +bdev_rbd_group_poll(void *arg) +{ + struct bdev_rbd_group_channel *group_ch = arg; + struct epoll_event events[MAX_EVENTS_PER_POLL]; + int num_events, i; + + num_events = epoll_wait(group_ch->epoll_fd, events, MAX_EVENTS_PER_POLL, 0); + + if 
(num_events <= 0) { + return SPDK_POLLER_IDLE; + } + + for (i = 0; i < num_events; i++) { + bdev_rbd_io_poll((struct bdev_rbd_io_channel *)events[i].data.ptr); + } + + return SPDK_POLLER_BUSY; +} + +static int +bdev_rbd_group_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_rbd_group_channel *ch = ctx_buf; + + ch->epoll_fd = epoll_create1(0); + if (ch->epoll_fd < 0) { + SPDK_ERRLOG("Could not create epoll fd on io device=%p\n", io_device); + return -1; + } + + ch->poller = SPDK_POLLER_REGISTER(bdev_rbd_group_poll, ch, 0); + + return 0; +} + +static void +bdev_rbd_group_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_rbd_group_channel *ch = ctx_buf; + + if (ch->epoll_fd >= 0) { + close(ch->epoll_fd); + } + + spdk_poller_unregister(&ch->poller); +} + static int bdev_rbd_library_init(void) { @@ -833,9 +842,13 @@ bdev_rbd_library_init(void) struct spdk_bdev *bdev; uint32_t block_size; long int tmp; + struct spdk_conf_section *sp; - struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Ceph"); + spdk_io_device_register(&rbd_if, bdev_rbd_group_create_cb, bdev_rbd_group_destroy_cb, + sizeof(struct bdev_rbd_group_channel), + "bdev_rbd_poll_groups"); + sp = spdk_conf_find_section(NULL, "Ceph"); if (sp == NULL) { /* * Ceph section not found. Do not initialize any rbd LUNS. @@ -895,4 +908,10 @@ bdev_rbd_library_init(void) return rc; } +static void +bdev_rbd_library_fini(void) +{ + spdk_io_device_unregister(&rbd_if, NULL); +} + SPDK_LOG_REGISTER_COMPONENT("bdev_rbd", SPDK_LOG_BDEV_RBD) diff --git a/module/bdev/uring/bdev_uring.c b/module/bdev/uring/bdev_uring.c index b917d26065b..e11783ea9ea 100644 --- a/module/bdev/uring/bdev_uring.c +++ b/module/bdev/uring/bdev_uring.c @@ -244,32 +244,30 @@ bdev_uring_group_poll(void *arg) int count, ret; to_submit = group_ch->io_pending; - to_complete = group_ch->io_inflight; - ret = 0; if (to_submit > 0) { /* If there are I/O to submit, use io_uring_submit here. 
* It will automatically call spdk_io_uring_enter appropriately. */ ret = io_uring_submit(&group_ch->uring); + if (ret < 0) { + return SPDK_POLLER_BUSY; + } + group_ch->io_pending = 0; group_ch->io_inflight += to_submit; - } else if (to_complete > 0) { - /* If there are I/O in flight but none to submit, we need to - * call io_uring_enter ourselves. */ - ret = spdk_io_uring_enter(group_ch->uring.ring_fd, 0, 0, - IORING_ENTER_GETEVENTS); - } - - if (ret < 0) { - return 1; } + to_complete = group_ch->io_inflight; count = 0; if (to_complete > 0) { count = bdev_uring_reap(&group_ch->uring, to_complete); } - return (count + to_submit); + if (count + to_submit > 0) { + return SPDK_POLLER_BUSY; + } else { + return SPDK_POLLER_IDLE; + } } static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, @@ -422,7 +420,9 @@ bdev_uring_group_create_cb(void *io_device, void *ctx_buf) { struct bdev_uring_group_channel *ch = ctx_buf; - if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) { + /* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only + * local devices but also devices attached from remote target */ + if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) { SPDK_ERRLOG("uring I/O context setup failure\n"); return -1; } @@ -436,7 +436,6 @@ bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) { struct bdev_uring_group_channel *ch = ctx_buf; - close(ch->uring.ring_fd); io_uring_queue_exit(&ch->uring); spdk_poller_unregister(&ch->poller); diff --git a/module/bdev/virtio/bdev_virtio_blk.c b/module/bdev/virtio/bdev_virtio_blk.c index 99653e238bc..8730f39db76 100644 --- a/module/bdev/virtio/bdev_virtio_blk.c +++ b/module/bdev/virtio/bdev_virtio_blk.c @@ -554,9 +554,7 @@ virtio_pci_blk_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) rc = virtio_dev_reset(vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES); if (rc != 0) { - virtio_dev_destruct(vdev); - free(bvdev); - return 
NULL; + goto fail; } /* TODO: add a way to limit usable virtqueues */ @@ -565,9 +563,7 @@ virtio_pci_blk_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) &num_queues, sizeof(num_queues)); if (rc) { SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); - virtio_dev_destruct(vdev); - free(bvdev); - return NULL; + goto fail; } } else { num_queues = 1; @@ -575,12 +571,16 @@ virtio_pci_blk_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) rc = virtio_blk_dev_init(bvdev, num_queues); if (rc != 0) { - virtio_dev_destruct(vdev); - free(bvdev); - return NULL; + goto fail; } return bvdev; + +fail: + vdev->ctx = NULL; + virtio_dev_destruct(vdev); + free(bvdev); + return NULL; } static struct virtio_blk_dev * diff --git a/module/bdev/virtio/bdev_virtio_scsi.c b/module/bdev/virtio/bdev_virtio_scsi.c index de65b15d7fa..7913595383e 100644 --- a/module/bdev/virtio/bdev_virtio_scsi.c +++ b/module/bdev/virtio/bdev_virtio_scsi.c @@ -344,19 +344,21 @@ virtio_pci_scsi_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) &num_queues, sizeof(num_queues)); if (rc) { SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); - virtio_dev_destruct(vdev); - free(svdev); - return NULL; + goto fail; } rc = virtio_scsi_dev_init(svdev, num_queues); if (rc != 0) { - virtio_dev_destruct(vdev); - free(svdev); - return NULL; + goto fail; } return svdev; + +fail: + vdev->ctx = NULL; + virtio_dev_destruct(vdev); + free(svdev); + return NULL; } static struct virtio_scsi_dev * @@ -811,7 +813,7 @@ bdev_virtio_poll(void *arg) if (spdk_unlikely(scan_ctx && io[i] == &scan_ctx->io_ctx)) { if (svdev->removed) { _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); - return -1; + return SPDK_POLLER_BUSY; } if (scan_ctx->restart) { @@ -831,9 +833,9 @@ bdev_virtio_poll(void *arg) if (spdk_unlikely(scan_ctx && scan_ctx->needs_resend)) { if (svdev->removed) { _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); - return -1; + return SPDK_POLLER_BUSY; } 
else if (cnt == 0) { - return 0; + return SPDK_POLLER_IDLE; } rc = send_scan_io(scan_ctx); @@ -1967,6 +1969,7 @@ bdev_virtio_pci_scsi_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) rc = virtio_scsi_dev_scan(svdev, create_ctx->cb_fn, create_ctx->cb_arg); if (rc) { + svdev->vdev.ctx = NULL; virtio_scsi_dev_remove(svdev, NULL, NULL); } diff --git a/module/blobfs/bdev/blobfs_fuse.c b/module/blobfs/bdev/blobfs_fuse.c index df6d61e04cf..16665498a5f 100644 --- a/module/blobfs/bdev/blobfs_fuse.c +++ b/module/blobfs/bdev/blobfs_fuse.c @@ -353,6 +353,8 @@ blobfs_fuse_start(const char *bdev_name, const char *mountpoint, struct spdk_fil void blobfs_fuse_stop(struct spdk_blobfs_fuse *bfuse) { - fuse_session_exit(fuse_get_session(bfuse->fuse_handle)); - pthread_kill(bfuse->fuse_tid, SIGINT); + if (bfuse) { + fuse_session_exit(fuse_get_session(bfuse->fuse_handle)); + pthread_kill(bfuse->fuse_tid, SIGINT); + } } diff --git a/module/event/rpc/app_rpc.c b/module/event/rpc/app_rpc.c index f223c17344f..ad6264d8599 100644 --- a/module/event/rpc/app_rpc.c +++ b/module/event/rpc/app_rpc.c @@ -540,4 +540,36 @@ rpc_thread_set_cpumask(struct spdk_jsonrpc_request *request, free(ctx); } SPDK_RPC_REGISTER("thread_set_cpumask", rpc_thread_set_cpumask, SPDK_RPC_RUNTIME) + +struct rpc_log_enable_timestamps { + bool enabled; +}; + +static const struct spdk_json_object_decoder rpc_log_enable_timestamps_decoders[] = { + {"enabled", offsetof(struct rpc_log_enable_timestamps, enabled), spdk_json_decode_bool}, +}; + +static void +rpc_log_enable_timestamps(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_enable_timestamps req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_enable_timestamps_decoders, + SPDK_COUNTOF(rpc_log_enable_timestamps_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + 
"spdk_json_decode_object failed"); + return; + } + + spdk_log_enable_timestamps(req.enabled); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("log_enable_timestamps", rpc_log_enable_timestamps, SPDK_RPC_RUNTIME) SPDK_LOG_REGISTER_COMPONENT("APP_RPC", SPDK_LOG_APP_RPC) diff --git a/module/event/subsystems/Makefile b/module/event/subsystems/Makefile index 58e47a6019d..a78985ec387 100644 --- a/module/event/subsystems/Makefile +++ b/module/event/subsystems/Makefile @@ -34,7 +34,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) include $(SPDK_ROOT_DIR)/mk/spdk.common.mk -DIRS-y += bdev accel iscsi net nvmf scsi vmd +DIRS-y += bdev accel iscsi net nvmf scsi vmd sock ifeq ($(OS),Linux) DIRS-y += nbd @@ -46,7 +46,7 @@ DIRS-$(CONFIG_VHOST) += vhost # the subsystem dependency tree defined within the event subsystem C files # themselves. Should that tree change, these dependencies should change # accordingly. 
-DEPDIRS-bdev := accel vmd +DEPDIRS-bdev := accel vmd sock DEPDIRS-iscsi := scsi DEPDIRS-nbd := bdev DEPDIRS-nvmf := bdev diff --git a/module/event/subsystems/bdev/bdev.c b/module/event/subsystems/bdev/bdev.c index a7926f93612..5776cf273a2 100644 --- a/module/event/subsystems/bdev/bdev.c +++ b/module/event/subsystems/bdev/bdev.c @@ -81,3 +81,4 @@ static struct spdk_subsystem g_spdk_subsystem_bdev = { SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_bdev); SPDK_SUBSYSTEM_DEPEND(bdev, accel) SPDK_SUBSYSTEM_DEPEND(bdev, vmd) +SPDK_SUBSYSTEM_DEPEND(bdev, sock) diff --git a/module/event/subsystems/iscsi/iscsi.c b/module/event/subsystems/iscsi/iscsi.c index 1fa3e915ce1..cecefd0a5a3 100644 --- a/module/event/subsystems/iscsi/iscsi.c +++ b/module/event/subsystems/iscsi/iscsi.c @@ -77,3 +77,4 @@ static struct spdk_subsystem g_spdk_subsystem_iscsi = { SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_iscsi); SPDK_SUBSYSTEM_DEPEND(iscsi, scsi) +SPDK_SUBSYSTEM_DEPEND(iscsi, sock) diff --git a/module/event/subsystems/nvmf/nvmf_tgt.c b/module/event/subsystems/nvmf/nvmf_tgt.c index e305692d9d0..0f516a54515 100644 --- a/module/event/subsystems/nvmf/nvmf_tgt.c +++ b/module/event/subsystems/nvmf/nvmf_tgt.c @@ -205,9 +205,15 @@ nvmf_tgt_subsystem_started(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) { subsystem = spdk_nvmf_subsystem_get_next(subsystem); + int rc; if (subsystem) { - spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); + rc = spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); + if (rc) { + g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS; + SPDK_ERRLOG("Unable to start NVMe-oF subsystem. 
Stopping app.\n"); + nvmf_tgt_advance_state(); + } return; } @@ -220,9 +226,14 @@ nvmf_tgt_subsystem_stopped(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) { subsystem = spdk_nvmf_subsystem_get_next(subsystem); + int rc; if (subsystem) { - spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); + rc = spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); + if (rc) { + SPDK_ERRLOG("Unable to stop NVMe-oF subsystem. Trying others.\n"); + nvmf_tgt_subsystem_stopped(subsystem, NULL, 0); + } return; } @@ -337,6 +348,7 @@ nvmf_tgt_advance_state(void) { enum nvmf_tgt_state prev_state; int rc = -1; + int ret; do { prev_state = g_tgt_state; @@ -369,7 +381,11 @@ nvmf_tgt_advance_state(void) subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); if (subsystem) { - spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); + ret = spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); + if (ret) { + SPDK_ERRLOG("Unable to start NVMe-oF subsystem. 
Stopping app.\n"); + g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS; + } } else { g_tgt_state = NVMF_TGT_RUNNING; } @@ -384,7 +400,10 @@ nvmf_tgt_advance_state(void) subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); if (subsystem) { - spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); + ret = spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); + if (ret) { + nvmf_tgt_subsystem_stopped(subsystem, NULL, 0); + } } else { g_tgt_state = NVMF_TGT_FINI_DESTROY_POLL_GROUPS; } @@ -445,3 +464,4 @@ static struct spdk_subsystem g_spdk_subsystem_nvmf = { SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nvmf) SPDK_SUBSYSTEM_DEPEND(nvmf, bdev) +SPDK_SUBSYSTEM_DEPEND(nvmf, sock) diff --git a/module/sock/vpp/Makefile b/module/event/subsystems/sock/Makefile similarity index 77% rename from module/sock/vpp/Makefile rename to module/event/subsystems/sock/Makefile index 016018c77c8..5a137d88de0 100644 --- a/module/sock/vpp/Makefile +++ b/module/event/subsystems/sock/Makefile @@ -1,8 +1,7 @@ # # BSD LICENSE # -# Copyright (c) Intel Corporation. -# All rights reserved. +# Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -31,24 +30,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) include $(SPDK_ROOT_DIR)/mk/spdk.common.mk -SO_VER := 2 +SO_VER := 1 SO_MINOR := 0 -C_SRCS += vpp.c -CFLAGS += -Wno-sign-compare -Wno-error=old-style-definition -CFLAGS += -Wno-error=strict-prototypes -Wno-error=ignored-qualifiers - -GCC_VERSION=$(shell $(CC) -dumpversion | cut -d. 
-f1) - -# disable packed member unalign warnings -ifeq ($(shell test $(GCC_VERSION) -ge 9 && echo 1), 1) -CFLAGS += -Wno-error=address-of-packed-member -endif - -LIBNAME = sock_vpp +C_SRCS = sock.c +LIBNAME = event_sock SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map diff --git a/lib/env_dpdk/pci_nvme.c b/module/event/subsystems/sock/sock.c similarity index 64% rename from lib/env_dpdk/pci_nvme.c rename to module/event/subsystems/sock/sock.c index a21b242e753..fdcb2160a03 100644 --- a/lib/env_dpdk/pci_nvme.c +++ b/module/event/subsystems/sock/sock.c @@ -1,8 +1,7 @@ /*- * BSD LICENSE * - * Copyright (c) Intel Corporation. - * All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,43 +30,33 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "env_internal.h" +#include "spdk/stdinc.h" +#include "spdk/sock.h" +#include "spdk_internal/event.h" -#include "spdk/pci_ids.h" - -static struct rte_pci_id nvme_pci_driver_id[] = { - { - .class_id = SPDK_PCI_CLASS_NVME, - .vendor_id = PCI_ANY_ID, - .device_id = PCI_ANY_ID, - .subsystem_vendor_id = PCI_ANY_ID, - .subsystem_device_id = PCI_ANY_ID, - }, - { .vendor_id = 0, /* sentinel */ }, -}; - -static struct spdk_pci_driver g_nvme_pci_drv = { - .driver = { - .drv_flags = RTE_PCI_DRV_NEED_MAPPING -#if RTE_VERSION >= RTE_VERSION_NUM(18, 8, 0, 0) - | RTE_PCI_DRV_WC_ACTIVATE -#endif - , - .id_table = nvme_pci_driver_id, - .probe = pci_device_init, - .remove = pci_device_fini, - .driver.name = "spdk_nvme", - }, +static void +sock_subsystem_init(void) +{ + spdk_subsystem_init_next(0); +} - .cb_fn = NULL, - .cb_arg = NULL, - .is_registered = false, -}; +static void +sock_subsystem_fini(void) +{ + spdk_subsystem_fini_next(); +} -struct spdk_pci_driver * -spdk_pci_nvme_get_driver(void) +static void 
+sock_subsystem_write_config_json(struct spdk_json_write_ctx *w) { - return &g_nvme_pci_drv; + spdk_sock_write_config_json(w); } -SPDK_PMD_REGISTER_PCI(g_nvme_pci_drv); +static struct spdk_subsystem g_spdk_subsystem_sock = { + .name = "sock", + .init = sock_subsystem_init, + .fini = sock_subsystem_fini, + .write_config_json = sock_subsystem_write_config_json, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_sock); diff --git a/module/sock/Makefile b/module/sock/Makefile index 865743d062a..f8df350abb9 100644 --- a/module/sock/Makefile +++ b/module/sock/Makefile @@ -38,7 +38,6 @@ DIRS-y = posix ifeq ($(OS), Linux) DIRS-$(CONFIG_URING) += uring endif -DIRS-$(CONFIG_VPP) += vpp .PHONY: all clean $(DIRS-y) diff --git a/module/sock/posix/posix.c b/module/sock/posix/posix.c index 7057cbfc78d..78a146cc7b7 100644 --- a/module/sock/posix/posix.c +++ b/module/sock/posix/posix.c @@ -49,8 +49,6 @@ #define MAX_TMPBUF 1024 #define PORTNUMLEN 32 -#define MIN_SO_RCVBUF_SIZE (2 * 1024 * 1024) -#define MIN_SO_SNDBUF_SIZE (2 * 1024 * 1024) #define IOV_BATCH_SIZE 64 #if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY) @@ -68,6 +66,7 @@ struct spdk_posix_sock { void *recv_buf; int recv_buf_sz; bool pending_recv; + int so_priority; TAILQ_ENTRY(spdk_posix_sock) link; }; @@ -80,7 +79,11 @@ struct spdk_posix_sock_group_impl { static struct spdk_sock_impl_opts g_spdk_posix_sock_impl_opts = { .recv_buf_size = MIN_SO_RCVBUF_SIZE, - .send_buf_size = MIN_SO_SNDBUF_SIZE + .send_buf_size = MIN_SO_SNDBUF_SIZE, + .enable_recv_pipe = true, + .enable_zerocopy_send = true, + .enable_quickack = false, + .enable_placement_id = false, }; static int @@ -266,9 +269,11 @@ posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz) assert(sock != NULL); - rc = posix_sock_alloc_pipe(sock, sz); - if (rc) { - return rc; + if (g_spdk_posix_sock_impl_opts.enable_recv_pipe) { + rc = posix_sock_alloc_pipe(sock, sz); + if (rc) { + return rc; + } } /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE */ @@ -308,9 
+313,9 @@ static struct spdk_posix_sock * posix_sock_alloc(int fd, bool enable_zero_copy) { struct spdk_posix_sock *sock; -#ifdef SPDK_ZEROCOPY - int rc; +#if defined(SPDK_ZEROCOPY) || defined(__linux__) int flag; + int rc; #endif sock = calloc(1, sizeof(*sock)); @@ -321,19 +326,31 @@ posix_sock_alloc(int fd, bool enable_zero_copy) sock->fd = fd; -#ifdef SPDK_ZEROCOPY - if (!enable_zero_copy) { +#if defined(SPDK_ZEROCOPY) + flag = 1; + + if (!enable_zero_copy || !g_spdk_posix_sock_impl_opts.enable_zerocopy_send) { return sock; } /* Try to turn on zero copy sends */ - flag = 1; rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag)); if (rc == 0) { sock->zcopy = true; } #endif +#if defined(__linux__) + flag = 1; + + if (g_spdk_posix_sock_impl_opts.enable_quickack) { + rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); + if (rc != 0) { + SPDK_ERRLOG("quickack was failed to set\n"); + } + } +#endif + return sock; } @@ -422,7 +439,7 @@ posix_sock_create(const char *ip, int port, hints.ai_flags |= AI_NUMERICHOST; rc = getaddrinfo(ip, portnum, &hints, &res0); if (rc != 0) { - SPDK_ERRLOG("getaddrinfo() failed (errno=%d)\n", errno); + SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc); return NULL; } @@ -552,6 +569,9 @@ posix_sock_create(const char *ip, int port, return NULL; } + if (opts != NULL) { + sock->so_priority = opts->priority; + } return &sock->base; } @@ -614,6 +634,7 @@ posix_sock_accept(struct spdk_sock *_sock) close(fd); return NULL; } + new_sock->so_priority = sock->base.opts.priority; return &new_sock->base; } @@ -780,13 +801,19 @@ _sock_flush(struct spdk_sock *sock) } rc = sendmsg(psock->fd, &msg, flags); if (rc <= 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) { + if (errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && psock->zcopy)) { return 0; } return rc; } - psock->sendmsg_idx++; + /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the + * 
req->internal.offset, so sendmsg_idx should not be zero */ + if (spdk_unlikely(psock->sendmsg_idx == UINT32_MAX)) { + psock->sendmsg_idx = 1; + } else { + psock->sendmsg_idx++; + } /* Consume the requests that were actually written */ req = TAILQ_FIRST(&sock->queued_reqs); @@ -1077,6 +1104,10 @@ posix_sock_get_placement_id(struct spdk_sock *_sock, int *placement_id) { int rc = -1; + if (!g_spdk_posix_sock_impl_opts.enable_placement_id) { + return rc; + } + #if defined(SO_INCOMING_NAPI_ID) struct spdk_posix_sock *sock = __posix_sock(_sock); socklen_t salen = sizeof(int); @@ -1225,6 +1256,17 @@ posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, if (num_events == -1) { return -1; + } else if (num_events == 0 && !TAILQ_EMPTY(&_group->socks)) { + uint8_t byte; + + sock = TAILQ_FIRST(&_group->socks); + psock = __posix_sock(sock); + /* a recv is done here to busy poll the queue associated with + * first socket in list and potentially reap incoming data. + */ + if (psock->so_priority) { + recv(psock->fd, &byte, 1, MSG_PEEK); + } } for (i = 0; i < num_events; i++) { @@ -1309,13 +1351,19 @@ posix_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) #define FIELD_OK(field) \ offsetof(struct spdk_sock_impl_opts, field) + sizeof(opts->field) <= *len - if (FIELD_OK(recv_buf_size)) { - opts->recv_buf_size = g_spdk_posix_sock_impl_opts.recv_buf_size; - } - if (FIELD_OK(send_buf_size)) { - opts->send_buf_size = g_spdk_posix_sock_impl_opts.send_buf_size; +#define GET_FIELD(field) \ + if (FIELD_OK(field)) { \ + opts->field = g_spdk_posix_sock_impl_opts.field; \ } + GET_FIELD(recv_buf_size); + GET_FIELD(send_buf_size); + GET_FIELD(enable_recv_pipe); + GET_FIELD(enable_zerocopy_send); + GET_FIELD(enable_quickack); + GET_FIELD(enable_placement_id); + +#undef GET_FIELD #undef FIELD_OK *len = spdk_min(*len, sizeof(g_spdk_posix_sock_impl_opts)); @@ -1333,13 +1381,19 @@ posix_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) 
#define FIELD_OK(field) \ offsetof(struct spdk_sock_impl_opts, field) + sizeof(opts->field) <= len - if (FIELD_OK(recv_buf_size)) { - g_spdk_posix_sock_impl_opts.recv_buf_size = opts->recv_buf_size; - } - if (FIELD_OK(send_buf_size)) { - g_spdk_posix_sock_impl_opts.send_buf_size = opts->send_buf_size; +#define SET_FIELD(field) \ + if (FIELD_OK(field)) { \ + g_spdk_posix_sock_impl_opts.field = opts->field; \ } + SET_FIELD(recv_buf_size); + SET_FIELD(send_buf_size); + SET_FIELD(enable_recv_pipe); + SET_FIELD(enable_zerocopy_send); + SET_FIELD(enable_quickack); + SET_FIELD(enable_placement_id); + +#undef SET_FIELD #undef FIELD_OK return 0; diff --git a/module/sock/uring/uring.c b/module/sock/uring/uring.c index b186420c881..447cccc9ff1 100644 --- a/module/sock/uring/uring.c +++ b/module/sock/uring/uring.c @@ -50,8 +50,6 @@ #define MAX_TMPBUF 1024 #define PORTNUMLEN 32 -#define SO_RCVBUF_SIZE (2 * 1024 * 1024) -#define SO_SNDBUF_SIZE (2 * 1024 * 1024) #define SPDK_SOCK_GROUP_QUEUE_DEPTH 4096 #define IOV_BATCH_SIZE 64 @@ -101,6 +99,14 @@ struct spdk_uring_sock_group_impl { TAILQ_HEAD(, spdk_uring_sock) pending_recv; }; +static struct spdk_sock_impl_opts g_spdk_uring_sock_impl_opts = { + .recv_buf_size = MIN_SO_RCVBUF_SIZE, + .send_buf_size = MIN_SO_SNDBUF_SIZE, + .enable_recv_pipe = true, + .enable_quickack = false, + .enable_placement_id = false, +}; + #define SPDK_URING_SOCK_REQUEST_IOV(req) ((struct iovec *)((uint8_t *)req + sizeof(struct spdk_sock_request))) static int @@ -286,18 +292,16 @@ uring_sock_set_recvbuf(struct spdk_sock *_sock, int sz) assert(sock != NULL); -#ifndef __aarch64__ - /* On ARM systems, this buffering does not help. Skip it. */ - /* The size of the pipe is purely derived from benchmarks. It seems to work well. 
*/ - rc = uring_sock_alloc_pipe(sock, sz); - if (rc) { - SPDK_ERRLOG("unable to allocate sufficient recvbuf with sz=%d on sock=%p\n", sz, _sock); - return rc; + if (g_spdk_uring_sock_impl_opts.enable_recv_pipe) { + rc = uring_sock_alloc_pipe(sock, sz); + if (rc) { + SPDK_ERRLOG("unable to allocate sufficient recvbuf with sz=%d on sock=%p\n", sz, _sock); + return rc; + } } -#endif - if (sz < SO_RCVBUF_SIZE) { - sz = SO_RCVBUF_SIZE; + if (sz < MIN_SO_RCVBUF_SIZE) { + sz = MIN_SO_RCVBUF_SIZE; } rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); @@ -316,8 +320,8 @@ uring_sock_set_sendbuf(struct spdk_sock *_sock, int sz) assert(sock != NULL); - if (sz < SO_SNDBUF_SIZE) { - sz = SO_SNDBUF_SIZE; + if (sz < MIN_SO_SNDBUF_SIZE) { + sz = MIN_SO_SNDBUF_SIZE; } rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); @@ -332,6 +336,10 @@ static struct spdk_uring_sock * uring_sock_alloc(int fd) { struct spdk_uring_sock *sock; +#if defined(__linux__) + int flag; + int rc; +#endif sock = calloc(1, sizeof(*sock)); if (sock == NULL) { @@ -340,6 +348,17 @@ uring_sock_alloc(int fd) } sock->fd = fd; + +#if defined(__linux__) + flag = 1; + + if (g_spdk_uring_sock_impl_opts.enable_quickack) { + rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); + if (rc != 0) { + SPDK_ERRLOG("quickack was failed to set\n"); + } + } +#endif return sock; } @@ -392,13 +411,13 @@ uring_sock_create(const char *ip, int port, continue; } - val = SO_RCVBUF_SIZE; + val = g_spdk_uring_sock_impl_opts.recv_buf_size; rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof val); if (rc) { /* Not fatal */ } - val = SO_SNDBUF_SIZE; + val = g_spdk_uring_sock_impl_opts.send_buf_size; rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof val); if (rc) { /* Not fatal */ @@ -925,8 +944,6 @@ sock_uring_group_reap(struct spdk_uring_sock_group_impl *group, int max, int max sock->pending_recv = true; TAILQ_INSERT_TAIL(&group->pending_recv, sock, link); } - } else { - 
SPDK_UNREACHABLE(); } break; case SPDK_SOCK_TASK_WRITE: @@ -1124,6 +1141,10 @@ uring_sock_get_placement_id(struct spdk_sock *_sock, int *placement_id) { int rc = -1; + if (!g_spdk_uring_sock_impl_opts.enable_placement_id) { + return rc; + } + #if defined(SO_INCOMING_NAPI_ID) struct spdk_uring_sock *sock = __uring_sock(_sock); socklen_t salen = sizeof(int); @@ -1199,13 +1220,15 @@ uring_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, struct spdk_sock *_sock, *tmp; struct spdk_uring_sock *sock; - TAILQ_FOREACH_SAFE(_sock, &group->base.socks, link, tmp) { - sock = __uring_sock(_sock); - if (spdk_unlikely(sock->connection_status)) { - continue; + if (spdk_likely(socks)) { + TAILQ_FOREACH_SAFE(_sock, &group->base.socks, link, tmp) { + sock = __uring_sock(_sock); + if (spdk_unlikely(sock->connection_status)) { + continue; + } + _sock_flush(_sock); + _sock_prep_pollin(_sock); } - _sock_flush(_sock); - _sock_prep_pollin(_sock); } to_submit = group->io_queued; @@ -1281,13 +1304,69 @@ uring_sock_group_impl_close(struct spdk_sock_group_impl *_group) assert(group->io_inflight == 0); assert(group->io_avail == SPDK_SOCK_GROUP_QUEUE_DEPTH); - close(group->uring.ring_fd); io_uring_queue_exit(&group->uring); free(group); return 0; } +static int +uring_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) +{ + if (!opts || !len) { + errno = EINVAL; + return -1; + } + +#define FIELD_OK(field) \ + offsetof(struct spdk_sock_impl_opts, field) + sizeof(opts->field) <= *len + +#define GET_FIELD(field) \ + if (FIELD_OK(field)) { \ + opts->field = g_spdk_uring_sock_impl_opts.field; \ + } + + GET_FIELD(recv_buf_size); + GET_FIELD(send_buf_size); + GET_FIELD(enable_recv_pipe); + GET_FIELD(enable_quickack); + GET_FIELD(enable_placement_id); + +#undef GET_FIELD +#undef FIELD_OK + + *len = spdk_min(*len, sizeof(g_spdk_uring_sock_impl_opts)); + return 0; +} + +static int +uring_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) +{ + if 
(!opts) { + errno = EINVAL; + return -1; + } + +#define FIELD_OK(field) \ + offsetof(struct spdk_sock_impl_opts, field) + sizeof(opts->field) <= len + +#define SET_FIELD(field) \ + if (FIELD_OK(field)) { \ + g_spdk_uring_sock_impl_opts.field = opts->field; \ + } + + SET_FIELD(recv_buf_size); + SET_FIELD(send_buf_size); + SET_FIELD(enable_recv_pipe); + SET_FIELD(enable_quickack); + SET_FIELD(enable_placement_id); + +#undef SET_FIELD +#undef FIELD_OK + + return 0; +} + static int uring_sock_flush(struct spdk_sock *_sock) { @@ -1324,6 +1403,8 @@ static struct spdk_net_impl g_uring_net_impl = { .group_impl_remove_sock = uring_sock_group_impl_remove_sock, .group_impl_poll = uring_sock_group_impl_poll, .group_impl_close = uring_sock_group_impl_close, + .get_opts = uring_sock_impl_get_opts, + .set_opts = uring_sock_impl_set_opts, }; SPDK_NET_IMPL_REGISTER(uring, &g_uring_net_impl, DEFAULT_SOCK_PRIORITY + 1); diff --git a/module/sock/vpp/vpp.c b/module/sock/vpp/vpp.c deleted file mode 100644 index 237c65fbb97..00000000000 --- a/module/sock/vpp/vpp.c +++ /dev/null @@ -1,1633 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright (c) Intel Corporation. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Omit from static analysis. */ -#ifndef __clang_analyzer__ - -#include "spdk/stdinc.h" - -#include "spdk/log.h" -#include "spdk/sock.h" -#include "spdk/net.h" -#include "spdk/string.h" -#include "spdk_internal/sock.h" -#include "spdk/queue.h" -#include "spdk/event.h" -#include "spdk/thread.h" -#include "spdk_internal/log.h" - -/* _GNU_SOURCE is redefined in the vpp headers with no protection (dlmalloc.h) */ -#undef _GNU_SOURCE - -#include -#include -#include -#include - -#define vl_typedefs /* define message structures */ -#include -#undef vl_typedefs - -/* declare message handlers for each api */ - -#define vl_endianfun /* define message structures */ -#include -#undef vl_endianfun - -/* instantiate all the print functions we know about */ -#define vl_print(handle, ...) 
-#define vl_printfun -#include -#undef vl_printfun - -#define SPDK_VPP_CLIB_MEM_SIZE 256 << 20 -#define SPDK_VPP_SESSIONS_MAX 2048 -#define SPDK_VPP_LISTEN_QUEUE_SIZE SPDK_VPP_SESSIONS_MAX -#define SPDK_VPP_SEGMENT_BASEVA 0x200000000ULL -#define SPDK_VPP_SEGMENT_TIMEOUT 20 -#define IOV_BATCH_SIZE 64 - -/* VPP connection state */ -enum spdk_vpp_state { - VPP_STATE_START, - VPP_STATE_ENABLED, - VPP_STATE_ATTACHED, - VPP_STATE_READY, - VPP_STATE_DISCONNECTING, - VPP_STATE_FAILED -}; - -/* VPP session state */ -enum spdk_vpp_session_state { - VPP_SESSION_STATE_UNUSED = 0, - VPP_SESSION_STATE_INIT, /* Initial state */ - VPP_SESSION_STATE_READY, /* Ready for processing */ - VPP_SESSION_STATE_DISCONNECT, - VPP_SESSION_STATE_CLOSE, - VPP_SESSION_STATE_FAILED -}; - -struct spdk_vpp_session { - struct spdk_sock base; - - /* VPP app session */ - app_session_t app_session; - - uint32_t id; - - bool is_server; /* Server side session */ - bool is_listen; /* Session is listener */ - - uint64_t handle; - uint32_t context; - - /* Listener fields */ - pthread_mutex_t accept_session_lock; - uint32_t *accept_session_index_fifo; -}; - -static struct spdk_vpp_main { - int my_client_index; - enum spdk_vpp_state vpp_state; - bool vpp_initialized; - struct spdk_thread *init_thread; - - svm_fifo_segment_main_t segment_main; - svm_queue_t *vl_input_queue; - svm_queue_t *vl_output_queue; - svm_msg_q_t *app_event_queue; - - struct spdk_vpp_session sessions[SPDK_VPP_SESSIONS_MAX]; - pthread_mutex_t session_get_lock; - - struct spdk_poller *vpp_queue_poller; - struct spdk_poller *app_queue_poller; - struct spdk_poller *timeout_poller; -} g_svm; - -struct spdk_vpp_sock_group_impl { - struct spdk_sock_group_impl base; - struct spdk_sock *last_sock; -}; - -#define __vpp_session(sock) ((struct spdk_vpp_session *)sock) -#define __vpp_group_impl(group) ((struct spdk_vpp_sock_group_impl *)group) - -/****************************************************************************** - * Session management - 
*/ -static struct spdk_vpp_session * -vpp_session_create(void) -{ - struct spdk_vpp_session *session; - int i; - - pthread_mutex_lock(&g_svm.session_get_lock); - for (i = 0; i < SPDK_VPP_SESSIONS_MAX && - g_svm.sessions[i].app_session.session_state != VPP_SESSION_STATE_UNUSED; i++) { - /* Empty loop body */ - } - if (i == SPDK_VPP_SESSIONS_MAX || - g_svm.sessions[i].app_session.session_state != VPP_SESSION_STATE_UNUSED) { - SPDK_ERRLOG("Cannot allocate space for new session\n"); - pthread_mutex_unlock(&g_svm.session_get_lock); - return NULL; - } - session = &g_svm.sessions[i]; - memset(session, 0, sizeof(struct spdk_vpp_session)); - pthread_mutex_init(&session->accept_session_lock, NULL); - - session->id = i; - session->app_session.session_state = VPP_SESSION_STATE_INIT; - - pthread_mutex_unlock(&g_svm.session_get_lock); - - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Creating new session %p (%d)\n", - session, session->id); - - return session; -} - -static struct spdk_vpp_session * -vpp_session_get(uint32_t id) -{ - struct spdk_vpp_session *session = NULL; - - if (id >= SPDK_VPP_SESSIONS_MAX) { - return NULL; - } - - pthread_mutex_lock(&g_svm.session_get_lock); - if (g_svm.sessions[id].app_session.session_state != VPP_SESSION_STATE_UNUSED) { - session = &g_svm.sessions[id]; - } - pthread_mutex_unlock(&g_svm.session_get_lock); - - return session; -} - -static struct spdk_vpp_session * -vpp_session_get_by_handle(uint64_t handle, bool is_listen) -{ - struct spdk_vpp_session *session = NULL; - int i; - - for (i = 0; i < SPDK_VPP_SESSIONS_MAX; i++) { - if (g_svm.sessions[i].app_session.session_state != VPP_SESSION_STATE_UNUSED && - g_svm.sessions[i].app_session.session_state != VPP_SESSION_STATE_DISCONNECT && - g_svm.sessions[i].handle == handle && - g_svm.sessions[i].is_listen == is_listen) { - session = &g_svm.sessions[i]; - break; - } - } - - return session; -} - -static int -vpp_session_free(struct spdk_vpp_session *session) -{ - /* Remove session */ - if (session == NULL) { - 
SPDK_ERRLOG("Wrong session\n"); - return -EINVAL; - } - - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Free session %p (%d)\n", session, session->id); - - pthread_mutex_lock(&g_svm.session_get_lock); - session->app_session.session_state = VPP_SESSION_STATE_UNUSED; - pthread_mutex_destroy(&session->accept_session_lock); - pthread_mutex_unlock(&g_svm.session_get_lock); - - return 0; -} - -static int -vpp_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, - char *caddr, int clen, uint16_t *cport) -{ - struct spdk_vpp_session *session = __vpp_session(_sock); - const char *result = NULL; - - assert(session != NULL); - assert(g_svm.vpp_initialized); - - if (session->app_session.transport.is_ip4) { - result = inet_ntop(AF_INET, &session->app_session.transport.lcl_ip.ip4.as_u8, - saddr, slen); - } else { - result = inet_ntop(AF_INET6, &session->app_session.transport.lcl_ip.ip6.as_u8, - saddr, slen); - } - if (result == NULL) { - return -1; - } - - if (sport) { - *sport = ntohs(session->app_session.transport.lcl_port); - } - - if (session->app_session.transport.is_ip4) { - result = inet_ntop(AF_INET, &session->app_session.transport.rmt_ip.ip4.as_u8, - caddr, clen); - } else { - result = inet_ntop(AF_INET6, &session->app_session.transport.rmt_ip.ip6.as_u8, - caddr, clen); - } - if (result == NULL) { - return -1; - } - - if (cport) { - *cport = ntohs(session->app_session.transport.rmt_port); - } - - return 0; -} - -enum spdk_vpp_create_type { - SPDK_SOCK_CREATE_LISTEN, - SPDK_SOCK_CREATE_CONNECT, -}; - -/****************************************************************************** - * VPP message handlers - */ -static void -session_accepted_handler(session_accepted_msg_t *mp) -{ - svm_fifo_t *rx_fifo, *tx_fifo; - struct spdk_vpp_session *client_session, *listen_session; - - pthread_mutex_lock(&g_svm.session_get_lock); - listen_session = vpp_session_get_by_handle(mp->listener_handle, true); - pthread_mutex_unlock(&g_svm.session_get_lock); - if (!listen_session) { 
- SPDK_ERRLOG("Listener not found\n"); - return; - } - - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Listeners handle is %" PRIu64 "\n", mp->listener_handle); - - /* Allocate local session for a client and set it up */ - client_session = vpp_session_create(); - if (client_session == NULL) { - SPDK_ERRLOG("Cannot create new session\n"); - return; - } - - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Accept session %p (%d) on %p (%d/%" PRIu64 ")\n", - client_session, client_session->id, listen_session, listen_session->id, - listen_session->handle); - - rx_fifo = uword_to_pointer(mp->server_rx_fifo, svm_fifo_t *); - rx_fifo->client_session_index = client_session->id; - tx_fifo = uword_to_pointer(mp->server_tx_fifo, svm_fifo_t *); - tx_fifo->client_session_index = client_session->id; - - client_session->handle = mp->handle; - client_session->context = mp->context; - client_session->app_session.rx_fifo = rx_fifo; - client_session->app_session.tx_fifo = tx_fifo; - client_session->app_session.vpp_evt_q = uword_to_pointer(mp->vpp_event_queue_address, - svm_msg_q_t *); - - client_session->is_server = true; - client_session->app_session.transport.rmt_port = mp->port; - client_session->app_session.transport.is_ip4 = mp->is_ip4; - memcpy(&client_session->app_session.transport.rmt_ip, mp->ip, sizeof(mp->ip)); - - client_session->app_session.transport.lcl_port = listen_session->app_session.transport.lcl_port; - memcpy(&client_session->app_session.transport.lcl_ip, &listen_session->app_session.transport.lcl_ip, - sizeof(listen_session->app_session.transport.lcl_ip)); - client_session->app_session.transport.is_ip4 = listen_session->app_session.transport.is_ip4; - - client_session->app_session.session_state = VPP_SESSION_STATE_READY; - - pthread_mutex_lock(&listen_session->accept_session_lock); - - clib_fifo_add1(listen_session->accept_session_index_fifo, - client_session->id); - - pthread_mutex_unlock(&listen_session->accept_session_lock); -} - -static void -session_connected_handler(session_connected_msg_t 
*mp) -{ - struct spdk_vpp_session *session; - svm_fifo_t *rx_fifo, *tx_fifo; - - session = vpp_session_get(mp->context); - if (session == NULL) { - return; - } - - if (mp->retval) { - SPDK_ERRLOG("Connection failed (%d).\n", ntohl(mp->retval)); - session->app_session.session_state = VPP_SESSION_STATE_FAILED; - return; - } - - session->app_session.vpp_evt_q = uword_to_pointer(mp->vpp_event_queue_address, - svm_msg_q_t *); - - rx_fifo = uword_to_pointer(mp->server_rx_fifo, svm_fifo_t *); - rx_fifo->client_session_index = session->id; - tx_fifo = uword_to_pointer(mp->server_tx_fifo, svm_fifo_t *); - tx_fifo->client_session_index = session->id; - - session->app_session.rx_fifo = rx_fifo; - session->app_session.tx_fifo = tx_fifo; - session->handle = mp->handle; - - /* Set lcl addr */ - session->app_session.transport.is_ip4 = mp->is_ip4; - memcpy(&session->app_session.transport.lcl_ip, mp->lcl_ip, sizeof(mp->lcl_ip)); - session->app_session.transport.lcl_port = mp->lcl_port; - - session->app_session.session_state = VPP_SESSION_STATE_READY; -} - -static void -session_disconnected_handler(session_disconnected_msg_t *mp) -{ - struct spdk_vpp_session *session = 0; - - pthread_mutex_lock(&g_svm.session_get_lock); - session = vpp_session_get_by_handle(mp->handle, false); - if (session == NULL) { - SPDK_ERRLOG("Session with handle=%" PRIu64 " not found.\n", - mp->handle); - pthread_mutex_unlock(&g_svm.session_get_lock); - return; - } - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Disconnect session %p (%d) handler\n", session, session->id); - - /* We need to postpone session deletion to inform upper layer */ - session->app_session.session_state = VPP_SESSION_STATE_DISCONNECT; - pthread_mutex_unlock(&g_svm.session_get_lock); -} - -static void -session_reset_handler(session_reset_msg_t *mp) -{ - int rv = 0; - struct spdk_vpp_session *session = NULL; - app_session_evt_t app_evt; - session_reset_reply_msg_t *rmp; - - pthread_mutex_lock(&g_svm.session_get_lock); - session = 
vpp_session_get_by_handle(mp->handle, false); - if (session == NULL) { - SPDK_ERRLOG("Session with handle=%" PRIu64 " not found.\n", - mp->handle); - pthread_mutex_unlock(&g_svm.session_get_lock); - return; - } - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Reset session %p (%d) handler\n", session, session->id); - - session->app_session.session_state = VPP_SESSION_STATE_DISCONNECT; - pthread_mutex_unlock(&g_svm.session_get_lock); - - app_alloc_ctrl_evt_to_vpp(session->app_session.vpp_evt_q, &app_evt, - SESSION_CTRL_EVT_RESET_REPLY); - rmp = (session_reset_reply_msg_t *) app_evt.evt->data; - rmp->retval = rv; - rmp->handle = mp->handle; - app_send_ctrl_evt_to_vpp(session->app_session.vpp_evt_q, &app_evt); -} - -static void -session_bound_handler(session_bound_msg_t *mp) -{ - struct spdk_vpp_session *session; - - /* Context should be set to the session index */ - session = vpp_session_get(mp->context); - - if (mp->retval) { - SPDK_ERRLOG("Bind failed (%d).\n", ntohl(mp->retval)); - session->app_session.session_state = VPP_SESSION_STATE_FAILED; - return; - } - - /* Set local address */ - session->app_session.transport.is_ip4 = mp->lcl_is_ip4; - memcpy(&session->app_session.transport.lcl_ip, mp->lcl_ip, sizeof(mp->lcl_ip)); - session->app_session.transport.lcl_port = mp->lcl_port; - - /* Register listener */ - session->handle = mp->handle; - - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Bind session %p (%d/%" PRIu64 ")\n", - session, session->id, session->handle); - - /* Session binded, set listen state */ - session->is_listen = true; - session->app_session.session_state = VPP_SESSION_STATE_READY; -} - -static void -session_unlisten_reply_handler(session_unlisten_reply_msg_t *mp) -{ - struct spdk_vpp_session *session; - - if (mp->retval != 0) { - SPDK_ERRLOG("Cannot unbind socket\n"); - return; - } - - session = vpp_session_get(mp->context); - if (session == NULL) { - SPDK_ERRLOG("Cannot find a session by context\n"); - return; - } - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Unbind session %p (%d)\n", 
session, session->id); - - session->app_session.session_state = VPP_SESSION_STATE_CLOSE; -} - -static void -handle_mq_event(session_event_t *e) -{ - switch (e->event_type) { - case SESSION_CTRL_EVT_BOUND: - session_bound_handler((session_bound_msg_t *) e->data); - break; - case SESSION_CTRL_EVT_ACCEPTED: - session_accepted_handler((session_accepted_msg_t *) e->data); - break; - case SESSION_CTRL_EVT_CONNECTED: - session_connected_handler((session_connected_msg_t *) e->data); - break; - case SESSION_CTRL_EVT_DISCONNECTED: - session_disconnected_handler((session_disconnected_msg_t *) e->data); - break; - case SESSION_CTRL_EVT_RESET: - session_reset_handler((session_reset_msg_t *) e->data); - break; - case SESSION_CTRL_EVT_UNLISTEN_REPLY: - session_unlisten_reply_handler((session_unlisten_reply_msg_t *) e->data); - break; - default: - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Unhandled event %u\n", e->event_type); - } -} - -static int -vpp_queue_poller(void *ctx) -{ - uword msg; - - if (g_svm.vl_output_queue->cursize > 0 && - !svm_queue_sub_raw(g_svm.vl_output_queue, (u8 *)&msg)) { - vl_msg_api_handler((void *)msg); - } - - return 0; -} - -static int -app_queue_poller(void *ctx) -{ - session_event_t *e; - svm_msg_q_msg_t msg; - - if (!svm_msg_q_is_empty(g_svm.app_event_queue)) { - svm_msg_q_sub(g_svm.app_event_queue, &msg, SVM_Q_WAIT, 0); - e = svm_msg_q_msg_data(g_svm.app_event_queue, &msg); - handle_mq_event(e); - svm_msg_q_free_msg(g_svm.app_event_queue, &msg); - } - return 0; -} - -/* This is required until sock.c API changes to asynchronous */ -static int -_wait_for_session_state_change(struct spdk_vpp_session *session, enum spdk_vpp_session_state state) -{ - time_t start = time(NULL); - while (time(NULL) - start < 10) { - if (session->app_session.session_state == VPP_SESSION_STATE_FAILED) { - errno = EADDRNOTAVAIL; - return -1; - } - if (session->app_session.session_state == state) { - errno = 0; - return 0; - } - if (spdk_get_thread() == g_svm.init_thread) { - 
usleep(100000); - app_queue_poller(NULL); - vpp_queue_poller(NULL); - } - } - /* timeout */ - errno = ETIMEDOUT; - return -1; -} - -static int -vpp_session_connect(struct spdk_vpp_session *session) -{ - vl_api_connect_sock_t *cmp; - - cmp = vl_msg_api_alloc(sizeof(*cmp)); - if (cmp == NULL) { - return -ENOMEM; - } - memset(cmp, 0, sizeof(*cmp)); - - cmp->_vl_msg_id = ntohs(VL_API_CONNECT_SOCK); - cmp->client_index = g_svm.my_client_index; - cmp->context = session->id; - - cmp->vrf = 0 /* VPPCOM_VRF_DEFAULT */; - cmp->is_ip4 = (session->app_session.transport.is_ip4); - memcpy(cmp->ip, &session->app_session.transport.rmt_ip, sizeof(cmp->ip)); - cmp->port = session->app_session.transport.rmt_port; - cmp->proto = TRANSPORT_PROTO_TCP; - vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&cmp); - - return _wait_for_session_state_change(session, VPP_SESSION_STATE_READY); -} - -static void -vl_api_disconnect_session_reply_t_handler(vl_api_disconnect_session_reply_t *mp) -{ - struct spdk_vpp_session *session; - - if (mp->retval) { - SPDK_ERRLOG("Disconnecting session failed (%d).\n", ntohl(mp->retval)); - return; - } - - pthread_mutex_lock(&g_svm.session_get_lock); - session = vpp_session_get_by_handle(mp->handle, false); - if (session == NULL) { - SPDK_ERRLOG("Invalid session handler (%" PRIu64 ").\n", mp->handle); - pthread_mutex_unlock(&g_svm.session_get_lock); - return; - } - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Session disconnected %p (%d)\n", session, session->id); - session->app_session.session_state = VPP_SESSION_STATE_CLOSE; - pthread_mutex_unlock(&g_svm.session_get_lock); -} - -static int -vpp_session_disconnect(struct spdk_vpp_session *session) -{ - int rv = 0; - vl_api_disconnect_session_t *dmp; - session_disconnected_reply_msg_t *rmp; - app_session_evt_t app_evt; - - if (session->app_session.session_state == VPP_SESSION_STATE_DISCONNECT) { - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Session is already in disconnecting state %p (%d)\n", - session, session->id); - - 
app_alloc_ctrl_evt_to_vpp(session->app_session.vpp_evt_q, &app_evt, - SESSION_CTRL_EVT_DISCONNECTED_REPLY); - rmp = (session_disconnected_reply_msg_t *) app_evt.evt->data; - rmp->retval = rv; - rmp->handle = session->handle; - rmp->context = session->context; - app_send_ctrl_evt_to_vpp(session->app_session.vpp_evt_q, &app_evt); - - return 0; - } - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Disconnect session %p (%d)\n", session, session->id); - - dmp = vl_msg_api_alloc(sizeof(*dmp)); - if (dmp == NULL) { - return -ENOMEM; - } - memset(dmp, 0, sizeof(*dmp)); - dmp->_vl_msg_id = ntohs(VL_API_DISCONNECT_SESSION); - dmp->client_index = g_svm.my_client_index; - dmp->handle = session->handle; - vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&dmp); - - return _wait_for_session_state_change(session, VPP_SESSION_STATE_CLOSE); -} - -static int -send_unbind_sock(struct spdk_vpp_session *session) -{ - vl_api_unbind_sock_t *ump; - - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Unbind session %p (%d) request\n", session, session->id); - - ump = vl_msg_api_alloc(sizeof(*ump)); - if (ump == NULL) { - return -ENOMEM; - } - memset(ump, 0, sizeof(*ump)); - - ump->_vl_msg_id = ntohs(VL_API_UNBIND_SOCK); - ump->client_index = g_svm.my_client_index; - ump->handle = session->handle; - ump->context = session->id; - vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&ump); - - return _wait_for_session_state_change(session, VPP_SESSION_STATE_CLOSE); -} - -static int -vpp_session_listen(struct spdk_vpp_session *session) -{ - vl_api_bind_sock_t *bmp; - - if (session->is_listen) { - /* Already in the listen state */ - return 0; - } - - clib_fifo_resize(session->accept_session_index_fifo, SPDK_VPP_LISTEN_QUEUE_SIZE); - - session->is_server = 1; - bmp = vl_msg_api_alloc(sizeof(*bmp)); - if (bmp == NULL) { - return -ENOMEM; - } - memset(bmp, 0, sizeof(*bmp)); - - bmp->_vl_msg_id = ntohs(VL_API_BIND_SOCK); - bmp->client_index = g_svm.my_client_index; - bmp->context = session->id; - bmp->vrf = 0; - bmp->is_ip4 = 
session->app_session.transport.is_ip4; - memcpy(bmp->ip, &session->app_session.transport.lcl_ip, sizeof(bmp->ip)); - bmp->port = session->app_session.transport.lcl_port; - bmp->proto = TRANSPORT_PROTO_TCP; - - vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&bmp); - - return _wait_for_session_state_change(session, VPP_SESSION_STATE_READY); -} - -static struct spdk_sock * -vpp_sock_create(const char *ip, int port, enum spdk_vpp_create_type type, - struct spdk_sock_opts *opts) -{ - struct spdk_vpp_session *session; - int rc; - uint8_t is_ip4 = 0; - ip46_address_t addr_buf; - - if (!g_svm.vpp_initialized || ip == NULL) { - return NULL; - } - - session = vpp_session_create(); - if (session == NULL) { - SPDK_ERRLOG("vpp_session_create() failed\n"); - errno = ENOMEM; - return NULL; - } - - /* Check address family */ - if (inet_pton(AF_INET, ip, &addr_buf.ip4.as_u8)) { - is_ip4 = 1; - } else if (inet_pton(AF_INET6, ip, &addr_buf.ip6.as_u8)) { - is_ip4 = 0; - } else { - SPDK_ERRLOG("IP address with invalid format\n"); - errno = EAFNOSUPPORT; - goto err; - } - - if (type == SPDK_SOCK_CREATE_LISTEN) { - session->app_session.transport.is_ip4 = is_ip4; - memcpy(&session->app_session.transport.lcl_ip, &addr_buf, sizeof(addr_buf)); - session->app_session.transport.lcl_port = htons(port); - - rc = vpp_session_listen(session); - if (rc != 0) { - errno = -rc; - SPDK_ERRLOG("session_listen() failed\n"); - goto err; - } - } else if (type == SPDK_SOCK_CREATE_CONNECT) { - session->app_session.transport.is_ip4 = is_ip4; - memcpy(&session->app_session.transport.rmt_ip, &addr_buf, sizeof(addr_buf)); - session->app_session.transport.rmt_port = htons(port); - - rc = vpp_session_connect(session); - if (rc != 0) { - SPDK_ERRLOG("session_connect() failed\n"); - goto err; - } - } - - return &session->base; - -err: - vpp_session_free(session); - return NULL; -} - -static struct spdk_sock * -vpp_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) -{ - return vpp_sock_create(ip, 
port, SPDK_SOCK_CREATE_LISTEN, opts); -} - -static struct spdk_sock * -vpp_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) -{ - return vpp_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts); -} - -static struct spdk_sock * -vpp_sock_accept(struct spdk_sock *_sock) -{ - struct spdk_vpp_session *listen_session = __vpp_session(_sock); - struct spdk_vpp_session *client_session = NULL; - u32 client_session_index = ~0; - uword elts = 0; - app_session_evt_t app_evt; - session_accepted_reply_msg_t *rmp; - - assert(listen_session != NULL); - assert(g_svm.vpp_initialized); - - if (listen_session->app_session.session_state != VPP_SESSION_STATE_READY) { - /* Listen session should be in the listen state */ - errno = EWOULDBLOCK; - return NULL; - } - - pthread_mutex_lock(&listen_session->accept_session_lock); - - if (listen_session->accept_session_index_fifo != NULL) { - elts = clib_fifo_elts(listen_session->accept_session_index_fifo); - } - - if (elts == 0) { - /* No client sessions */ - errno = EAGAIN; - pthread_mutex_unlock(&listen_session->accept_session_lock); - return NULL; - } - - clib_fifo_sub1(listen_session->accept_session_index_fifo, - client_session_index); - - pthread_mutex_unlock(&listen_session->accept_session_lock); - - client_session = vpp_session_get(client_session_index); - if (client_session == NULL) { - SPDK_ERRLOG("client session closed or aborted\n"); - errno = ECONNABORTED; - return NULL; - } - - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Client %p(%" PRIu32 ") accepted.\n", - client_session, client_session_index); - - /* - * Send accept session reply - */ - app_alloc_ctrl_evt_to_vpp(client_session->app_session.vpp_evt_q, &app_evt, - SESSION_CTRL_EVT_ACCEPTED_REPLY); - rmp = (session_accepted_reply_msg_t *) app_evt.evt->data; - rmp->handle = client_session->handle; - rmp->context = client_session->context; - app_send_ctrl_evt_to_vpp(client_session->app_session.vpp_evt_q, &app_evt); - - return &client_session->base; -} - -static int 
-vpp_sock_close(struct spdk_sock *_sock) -{ - struct spdk_vpp_session *session = __vpp_session(_sock); - - assert(session != NULL); - assert(g_svm.vpp_initialized); - - if (session->is_listen) { - send_unbind_sock(session); - } else { - vpp_session_disconnect(session); - } - vpp_session_free(session); - - return 0; -} - -static ssize_t -vpp_sock_recv(struct spdk_sock *_sock, void *buf, size_t len) -{ - struct spdk_vpp_session *session = __vpp_session(_sock); - int rc; - svm_fifo_t *rx_fifo; - uint32_t bytes; - - assert(session != NULL); - assert(g_svm.vpp_initialized); - - rx_fifo = session->app_session.rx_fifo; - - bytes = svm_fifo_max_dequeue(session->app_session.rx_fifo); - if (bytes > (ssize_t)len) { - bytes = len; - } - - if (bytes == 0) { - if (session->app_session.session_state == VPP_SESSION_STATE_DISCONNECT) { - /* Socket is disconnected */ - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Client %p(%" PRIu32 ") is disconnected.\n", - session, session->id); - errno = 0; - return 0; - } - errno = EAGAIN; - return -1; - } - - rc = app_recv_stream_raw(rx_fifo, buf, bytes, 0, 0); - if (rc < 0) { - errno = -rc; - return rc; - } - - return rc; -} - -static ssize_t -vpp_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) -{ - ssize_t total = 0; - int i, rc; - - assert(_sock != NULL); - assert(g_svm.vpp_initialized); - - for (i = 0; i < iovcnt; ++i) { - rc = vpp_sock_recv(_sock, iov[i].iov_base, iov[i].iov_len); - if (rc < 0) { - if (total > 0) { - break; - } else { - errno = -rc; - return -1; - } - } else { - total += rc; - if (rc < (ssize_t)iov[i].iov_len) { - /* Read less than buffer provided, no point to continue. 
*/ - break; - } - } - } - return total; -} - -static ssize_t -_vpp_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) -{ - struct spdk_vpp_session *session = __vpp_session(_sock); - ssize_t total = 0; - int i, rc; - svm_fifo_t *tx_fifo; - session_evt_type_t et; - - assert(session != NULL); - assert(g_svm.vpp_initialized); - - tx_fifo = session->app_session.tx_fifo; - et = SESSION_IO_EVT_TX; - - for (i = 0; i < iovcnt; ++i) { - if (svm_fifo_is_full(tx_fifo)) { - errno = EWOULDBLOCK; - return -1; - } - - /* We use only stream connection for now */ - rc = app_send_stream_raw(tx_fifo, session->app_session.vpp_evt_q, - iov[i].iov_base, iov[i].iov_len, et, - 1, SVM_Q_WAIT); - - if (rc < 0) { - if (total > 0) { - break; - } else { - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Buffer overflow\n"); - errno = EWOULDBLOCK; - return -1; - } - } else { - total += rc; - if (rc < (ssize_t)iov[i].iov_len) { - /* Write less than buffer provided, no point to continue. */ - break; - } - } - } - - return total; -} - -static int -_sock_flush(struct spdk_sock *sock) -{ - struct iovec iovs[IOV_BATCH_SIZE]; - int iovcnt; - int retval; - struct spdk_sock_request *req; - int i; - ssize_t rc; - unsigned int offset; - size_t len; - - /* Can't flush from within a callback or we end up with recursive calls */ - if (sock->cb_cnt > 0) { - return 0; - } - - /* Gather an iov */ - iovcnt = 0; - req = TAILQ_FIRST(&sock->queued_reqs); - while (req) { - offset = req->internal.offset; - - for (i = 0; i < req->iovcnt; i++) { - /* Consume any offset first */ - if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { - offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; - continue; - } - - iovs[iovcnt].iov_base = SPDK_SOCK_REQUEST_IOV(req, i)->iov_base + offset; - iovs[iovcnt].iov_len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; - iovcnt++; - - offset = 0; - - if (iovcnt >= IOV_BATCH_SIZE) { - break; - } - } - - if (iovcnt >= IOV_BATCH_SIZE) { - break; - } - - req = TAILQ_NEXT(req, internal.link); - 
} - - if (iovcnt == 0) { - return 0; - } - - /* Perform the vectored write */ - rc = _vpp_sock_writev(sock, iovs, iovcnt); - if (rc <= 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) { - return 0; - } - return rc; - } - - /* Consume the requests that were actually written */ - req = TAILQ_FIRST(&sock->queued_reqs); - while (req) { - offset = req->internal.offset; - - for (i = 0; i < req->iovcnt; i++) { - /* Advance by the offset first */ - if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { - offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; - continue; - } - - /* Calculate the remaining length of this element */ - len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; - - if (len > (size_t)rc) { - /* This element was partially sent. */ - req->internal.offset += rc; - return 0; - } - - offset = 0; - req->internal.offset += len; - rc -= len; - } - - /* Handled a full request. */ - req->internal.offset = 0; - spdk_sock_request_pend(sock, req); - - /* The _vpp_sock_writev above isn't currently asynchronous, - * so it's already done. */ - retval = spdk_sock_request_put(sock, req, 0); - - if (rc == 0 || retval) { - break; - } - - req = TAILQ_FIRST(&sock->queued_reqs); - } - - return 0; -} - -static ssize_t -vpp_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) -{ - int rc; - - /* In order to process a writev, we need to flush any asynchronous writes - * first. */ - rc = _sock_flush(_sock); - if (rc < 0) { - return rc; - } - - if (!TAILQ_EMPTY(&_sock->queued_reqs)) { - /* We weren't able to flush all requests */ - errno = EAGAIN; - return -1; - } - - return _vpp_sock_writev(_sock, iov, iovcnt); -} - -static void -vpp_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req) -{ - int rc; - - spdk_sock_request_queue(sock, req); - - if (sock->group_impl == NULL) { - spdk_sock_request_put(sock, req, -ENOTSUP); - return; - } - - /* If there are a sufficient number queued, just flush them out immediately. 
*/ - if (sock->queued_iovcnt >= IOV_BATCH_SIZE) { - rc = _sock_flush(sock); - if (rc) { - spdk_sock_abort_requests(sock); - } - } -} - -static int -vpp_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) -{ - assert(g_svm.vpp_initialized); - - return 0; -} - -static int -vpp_sock_set_recvbuf(struct spdk_sock *_sock, int sz) -{ - assert(g_svm.vpp_initialized); - - return 0; -} - -static int -vpp_sock_set_sendbuf(struct spdk_sock *_sock, int sz) -{ - assert(g_svm.vpp_initialized); - - return 0; -} - -static bool -vpp_sock_is_ipv6(struct spdk_sock *_sock) -{ - return !__vpp_session(_sock)->app_session.transport.is_ip4; -} - -static bool -vpp_sock_is_ipv4(struct spdk_sock *_sock) -{ - return __vpp_session(_sock)->app_session.transport.is_ip4; -} - -static bool -vpp_sock_is_connected(struct spdk_sock *_sock) -{ - assert(g_svm.vpp_initialized); - - return (__vpp_session(_sock)->app_session.session_state == VPP_SESSION_STATE_READY); -} - -static int -vpp_sock_get_placement_id(struct spdk_sock *_sock, int *placement_id) -{ - return -1; -} - -static struct spdk_sock_group_impl * -vpp_sock_group_impl_create(void) -{ - struct spdk_vpp_sock_group_impl *group_impl; - - if (!g_svm.vpp_initialized) { - return NULL; - } - - group_impl = calloc(1, sizeof(*group_impl)); - if (group_impl == NULL) { - SPDK_ERRLOG("sock_group allocation failed\n"); - errno = ENOMEM; - return NULL; - } - - return &group_impl->base; -} - -static int -vpp_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, - struct spdk_sock *_sock) -{ - /* We expect that higher level do it for us */ - return 0; -} - -static int -vpp_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, - struct spdk_sock *_sock) -{ - /* We expect that higher level do it for us */ - return 0; -} - -static bool -vpp_session_read_ready(struct spdk_vpp_session *session) -{ - svm_fifo_t *rx_fifo = NULL; - uint32_t ready = 0; - - if (session->app_session.session_state == VPP_SESSION_STATE_DISCONNECT) { - /* If session 
not found force reading to close it. - * NOTE: We're expecting here that upper layer will close - * connection when next read fails. - */ - return true; - } - - if (session->app_session.session_state == VPP_SESSION_STATE_READY) { - rx_fifo = session->app_session.rx_fifo; - ready = svm_fifo_max_dequeue(rx_fifo); - } - - return ready > 0; -} - -static int -vpp_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, - struct spdk_sock **socks) -{ - int num_events, rc; - struct spdk_sock *sock, *tmp; - struct spdk_vpp_session *session; - struct spdk_vpp_sock_group_impl *group; - - assert(_group != NULL); - assert(socks != NULL); - assert(g_svm.vpp_initialized); - - group = __vpp_group_impl(_group); - num_events = 0; - - /* This must be a TAILQ_FOREACH_SAFE because while flushing, - * a completion callback could remove the sock from the - * group. */ - TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { - rc = _sock_flush(sock); - if (rc) { - spdk_sock_abort_requests(sock); - } - } - - sock = group->last_sock; - if (sock == NULL) { - sock = TAILQ_FIRST(&group->base.socks); - } - - while (sock != NULL) { - session = __vpp_session(sock); - if (vpp_session_read_ready(session)) { - socks[num_events] = sock; - num_events++; - if (num_events >= max_events) { - sock = TAILQ_NEXT(sock, link); - break; - } - } - sock = TAILQ_NEXT(sock, link); - } - group->last_sock = sock; - - return num_events; -} - -static int -vpp_sock_group_impl_close(struct spdk_sock_group_impl *_group) -{ - free(_group); - return 0; -} - -/****************************************************************************** - * Initialize and attach to the VPP - */ -static int -vpp_app_attach(void) -{ - vl_api_application_attach_t *bmp; - u32 fifo_size = 16 << 20; - - bmp = vl_msg_api_alloc(sizeof(*bmp)); - if (bmp == NULL) { - return -ENOMEM; - } - memset(bmp, 0, sizeof(*bmp)); - - bmp->_vl_msg_id = ntohs(VL_API_APPLICATION_ATTACH); - bmp->client_index = g_svm.my_client_index; - bmp->context 
= ntohl(0xfeedface); - - bmp->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_ACCEPT_REDIRECT; - bmp->options[APP_OPTIONS_FLAGS] |= APP_OPTIONS_FLAGS_ADD_SEGMENT; - - bmp->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 16; - bmp->options[APP_OPTIONS_RX_FIFO_SIZE] = fifo_size; - bmp->options[APP_OPTIONS_TX_FIFO_SIZE] = fifo_size; - bmp->options[APP_OPTIONS_ADD_SEGMENT_SIZE] = 256 << 20; - bmp->options[APP_OPTIONS_SEGMENT_SIZE] = 512 << 20; - bmp->options[APP_OPTIONS_EVT_QUEUE_SIZE] = 256; - - vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&bmp); - - return 0; -} -static void -vl_api_session_enable_disable_reply_t_handler(vl_api_session_enable_disable_reply_t *mp) -{ - if (mp->retval) { - SPDK_ERRLOG("Session enable failed (%d).\n", ntohl(mp->retval)); - } else { - SPDK_NOTICELOG("Session layer enabled\n"); - g_svm.vpp_state = VPP_STATE_ENABLED; - vpp_app_attach(); - } -} - -static int -vpp_session_enable(u8 is_enable) -{ - vl_api_session_enable_disable_t *bmp; - - bmp = vl_msg_api_alloc(sizeof(*bmp)); - if (bmp == NULL) { - return -ENOMEM; - } - memset(bmp, 0, sizeof(*bmp)); - - bmp->_vl_msg_id = ntohs(VL_API_SESSION_ENABLE_DISABLE); - bmp->client_index = g_svm.my_client_index; - bmp->context = htonl(0xfeedface); - bmp->is_enable = is_enable; - vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&bmp); - - return 0; -} - -static void -vpp_application_attached(void *arg) -{ - SPDK_NOTICELOG("VPP net framework initialized.\n"); - g_svm.vpp_state = VPP_STATE_ATTACHED; - g_svm.vpp_initialized = true; - g_svm.app_queue_poller = SPDK_POLLER_REGISTER(app_queue_poller, NULL, 100); - spdk_net_framework_init_next(0); -} - -static int -ssvm_segment_attach(char *name, ssvm_segment_type_t type, int fd) -{ - svm_fifo_segment_create_args_t a; - int rv; - - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Attaching segment %s\n", name); - - clib_memset(&a, 0, sizeof(a)); - a.segment_name = (char *) name; - a.segment_type = type; - - assert(type == SSVM_SEGMENT_MEMFD); - a.memfd_fd = fd; - - if ((rv = 
svm_fifo_segment_attach(&g_svm.segment_main, &a))) { - SPDK_ERRLOG("Segment '%s' attach failed (%d).\n", name, rv); - return rv; - } - - vec_reset_length(a.new_segment_indices); - return 0; -} - -static void -vl_api_application_attach_reply_t_handler(vl_api_application_attach_reply_t *mp) -{ - u32 n_fds = 0; - - if (mp->retval) { - SPDK_ERRLOG("Application attach to VPP failed (%d)\n", - ntohl(mp->retval)); - goto err; - } - - if (mp->segment_name_length == 0) { - SPDK_ERRLOG("segment_name_length zero\n"); - goto err; - } - - assert(mp->app_event_queue_address); - g_svm.app_event_queue = uword_to_pointer(mp->app_event_queue_address, svm_msg_q_t *); - - if (mp->n_fds) { - int fds[mp->n_fds]; - - vl_socket_client_recv_fd_msg(fds, mp->n_fds, 5); - - if (mp->fd_flags & SESSION_FD_F_VPP_MQ_SEGMENT) { - if (ssvm_segment_attach(0, SSVM_SEGMENT_MEMFD, fds[n_fds++])) { - goto err; - } - } - - if (mp->fd_flags & SESSION_FD_F_MEMFD_SEGMENT) { - if (ssvm_segment_attach((char *) mp->segment_name, SSVM_SEGMENT_MEMFD, fds[n_fds++])) { - goto err; - } - } - - if (mp->fd_flags & SESSION_FD_F_MQ_EVENTFD) { - svm_msg_q_set_consumer_eventfd(g_svm.app_event_queue, fds[n_fds++]); - } - } - - spdk_thread_send_msg(g_svm.init_thread, vpp_application_attached, NULL); - return; -err: - g_svm.vpp_state = VPP_STATE_FAILED; - return; -} - -/* Detach */ -static void -vpp_application_detached(void *arg) -{ - if (!g_svm.vpp_initialized) { - return; - } - - spdk_poller_unregister(&g_svm.vpp_queue_poller); - spdk_poller_unregister(&g_svm.app_queue_poller); - spdk_poller_unregister(&g_svm.timeout_poller); - - g_svm.vpp_initialized = false; - g_svm.vpp_state = VPP_STATE_START; - pthread_mutex_destroy(&g_svm.session_get_lock); - vl_socket_client_disconnect(); - - SPDK_NOTICELOG("Application detached\n"); - - spdk_net_framework_fini_next(); -} - -static int -vpp_application_detached_timeout(void *arg) -{ - if (g_svm.vpp_initialized) { - /* We need to finish detach on initial thread */ - 
spdk_thread_send_msg(g_svm.init_thread, vpp_application_detached, NULL); - } - return 0; -} - -static void -vl_api_application_detach_reply_t_handler(vl_api_application_detach_reply_t *mp) -{ - if (mp->retval) { - SPDK_ERRLOG("Application detach from VPP failed (%d).\n", ntohl(mp->retval)); - g_svm.vpp_state = VPP_STATE_FAILED; - } - - /* We need to finish detach on initial thread */ - spdk_thread_send_msg(g_svm.init_thread, vpp_application_detached, NULL); -} - -static int -vpp_app_detach(void) -{ - vl_api_application_detach_t *bmp; - - bmp = vl_msg_api_alloc(sizeof(*bmp)); - if (bmp == NULL) { - return -ENOMEM; - } - memset(bmp, 0, sizeof(*bmp)); - - bmp->_vl_msg_id = ntohs(VL_API_APPLICATION_DETACH); - bmp->client_index = g_svm.my_client_index; - bmp->context = ntohl(0xfeedface); - vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&bmp); - - g_svm.timeout_poller = SPDK_POLLER_REGISTER(vpp_application_detached_timeout, - NULL, 10000000); - - return 0; -} - -static void -vl_api_map_another_segment_t_handler(vl_api_map_another_segment_t *mp) -{ - ssvm_segment_type_t seg_type = SSVM_SEGMENT_SHM; - int fd = -1; - - if (mp->fd_flags) { - vl_socket_client_recv_fd_msg(&fd, 1, 5); - seg_type = SSVM_SEGMENT_MEMFD; - } - - if (ssvm_segment_attach((char *) mp->segment_name, - seg_type, fd)) { - SPDK_ERRLOG("svm_fifo_segment_attach ('%s') failed\n", - mp->segment_name); - return; - } - - SPDK_DEBUGLOG(SPDK_SOCK_VPP, "New segment ('%s') attached\n", - mp->segment_name); -} - -static void -vpp_net_framework_set_handlers(void) -{ - /* Set up VPP handlers */ -#define _(N,n) \ - vl_msg_api_set_handlers(VL_API_##N, #n, \ - vl_api_##n##_t_handler, \ - vl_noop_handler, \ - vl_api_##n##_t_endian, \ - vl_api_##n##_t_print, \ - sizeof(vl_api_##n##_t), 1); - _(SESSION_ENABLE_DISABLE_REPLY, session_enable_disable_reply) \ - _(DISCONNECT_SESSION_REPLY, disconnect_session_reply) \ - _(APPLICATION_ATTACH_REPLY, application_attach_reply) \ - _(APPLICATION_DETACH_REPLY, 
application_detach_reply) \ - _(MAP_ANOTHER_SEGMENT, map_another_segment) -#undef _ -} - -static void -vpp_net_framework_init(void) -{ - char *app_name; - api_main_t *am = &api_main; - - clib_mem_init_thread_safe(0, SPDK_VPP_CLIB_MEM_SIZE); - svm_fifo_segment_main_init(&g_svm.segment_main, SPDK_VPP_SEGMENT_BASEVA, - SPDK_VPP_SEGMENT_TIMEOUT); - - app_name = spdk_sprintf_alloc("SPDK_%d", getpid()); - if (app_name == NULL) { - SPDK_ERRLOG("Cannot alloc memory for SPDK app name\n"); - return; - } - - vpp_net_framework_set_handlers(); - - if (vl_socket_client_connect((char *) API_SOCKET_FILE, app_name, - 0 /* default rx, tx buffer */)) { - SPDK_ERRLOG("Client \"%s\" failed to connect to the socket \"%s\".\n", - app_name, API_SOCKET_FILE); - goto err; - } - - if (vl_socket_client_init_shm(0, 0 /* want_pthread */)) { - SPDK_ERRLOG("SHM API initialization failed.\n"); - vl_socket_client_disconnect(); - goto err; - } - - g_svm.vl_input_queue = am->shmem_hdr->vl_input_queue; - g_svm.vl_output_queue = am->vl_input_queue; - - g_svm.my_client_index = am->my_client_index; - pthread_mutex_init(&g_svm.session_get_lock, NULL); - - free(app_name); - - g_svm.init_thread = spdk_get_thread(); - SPDK_NOTICELOG("Enable VPP session\n"); - - g_svm.vpp_queue_poller = SPDK_POLLER_REGISTER(vpp_queue_poller, NULL, 100); - - vpp_session_enable(1); - - return; - -err: - free(app_name); - spdk_net_framework_init_next(0); -} - -/****************************************************************************** - * Register components - */ -static struct spdk_net_impl g_vpp_net_impl = { - .name = "vpp", - .getaddr = vpp_sock_getaddr, - .connect = vpp_sock_connect, - .listen = vpp_sock_listen, - .accept = vpp_sock_accept, - .close = vpp_sock_close, - .recv = vpp_sock_recv, - .readv = vpp_sock_readv, - .writev = vpp_sock_writev, - .writev_async = vpp_sock_writev_async, - .set_recvlowat = vpp_sock_set_recvlowat, - .set_recvbuf = vpp_sock_set_recvbuf, - .set_sendbuf = vpp_sock_set_sendbuf, - .is_ipv6 = 
vpp_sock_is_ipv6, - .is_ipv4 = vpp_sock_is_ipv4, - .is_connected = vpp_sock_is_connected, - .get_placement_id = vpp_sock_get_placement_id, - .group_impl_create = vpp_sock_group_impl_create, - .group_impl_add_sock = vpp_sock_group_impl_add_sock, - .group_impl_remove_sock = vpp_sock_group_impl_remove_sock, - .group_impl_poll = vpp_sock_group_impl_poll, - .group_impl_close = vpp_sock_group_impl_close, -}; - -SPDK_NET_IMPL_REGISTER(vpp, &g_vpp_net_impl, DEFAULT_SOCK_PRIORITY + 2); - -static void -vpp_net_framework_fini(void) -{ - if (g_svm.vpp_initialized) { - vpp_app_detach(); - } else { - spdk_net_framework_fini_next(); - } -} - -static struct spdk_net_framework g_vpp_net_framework = { - .name = "vpp", - .init = vpp_net_framework_init, - .fini = vpp_net_framework_fini, -}; - -SPDK_NET_FRAMEWORK_REGISTER(vpp, &g_vpp_net_framework); - -SPDK_LOG_REGISTER_COMPONENT("sock_vpp", SPDK_SOCK_VPP) - -#endif /* __clang_analyzer__ */ diff --git a/ocf b/ocf index 9d079556408..02f6fc3a719 160000 --- a/ocf +++ b/ocf @@ -1 +1 @@ -Subproject commit 9d0795564082b5dfb489ab3fc2fa22cb1538ab85 +Subproject commit 02f6fc3a719f761eee35063d49677e891b02a31b diff --git a/pkg/spdk.spec b/pkg/spdk.spec index c2f311277e0..ffe884771ad 100644 --- a/pkg/spdk.spec +++ b/pkg/spdk.spec @@ -86,7 +86,6 @@ BuildArch: noarch --without-fio \ --with-vhost \ --without-pmdk \ - --without-vpp \ --without-rbd \ --with-rdma \ --with-shared \ diff --git a/scripts/bash-completion/spdk b/scripts/bash-completion/spdk new file mode 100644 index 00000000000..1e221f6cbe6 --- /dev/null +++ b/scripts/bash-completion/spdk @@ -0,0 +1,269 @@ +# shellcheck disable=SC2016,SC2207 + +_get_help() { + "$@" -h 2>&1 +} + +_get_help_opt() { + # Fetch all the optional parameters with help from _parse_help() + _parse_help - < <(printf '%s\n' "$@") +} + +_get_help_pos() { + local pos + + # Fetch all the positional parameters, i.e. get first word prefixed + # with 20h x 2. This may not be 100% accurate. 
Also, it won't return + # any usable strings, it's just meant to point out what type of + # mandatory argument the given method depends on, like bdev_name, etc. + # TODO: separate completion for such arguments, e.g., get all bdevs + # for parameter like bdev_name? + while read -r; do + [[ $REPLY =~ ^\ {2}[^\ -] ]] || continue + read -r pos _ <<< "$REPLY" && echo "$pos" + done < <(printf '%s\n' "$@") +} + +_get_default_rpc_methods() { + if [[ -S $rpc_sock ]]; then + _get_supported_methods "$1" + return 0 + fi + + local aliases method names + # Don't squash whitespaces, slurp the entire line + while read -r; do + # Each method name seems to be prefixed with 20h x 4. Then it can + # be followed with list of aliases enclosed inside (). Example: + # ioat_scan_accel_engine (ioat_scan_copy_engine, scan_ioat_copy_engine) + [[ $REPLY =~ ^\ {4}([a-z]+(_[a-z]+)*)(\ *\((.+)\))? ]] || continue + + names=("${BASH_REMATCH[1]}") + if [[ $SPDK_RPC_ALIASES == yes ]] && [[ -n ${BASH_REMATCH[4]} ]]; then + IFS=", " read -ra aliases <<< "${BASH_REMATCH[4]}" + names+=("${aliases[@]}") + fi + + for method in "${names[@]}"; do + rpc_methods["$method"]=1 + done + done < <(_get_help "$1" 2> /dev/null) +} + +_get_supported_methods() { + local method methods + + mapfile -t methods < <("$1" -s "$rpc_sock" rpc_get_methods 2> /dev/null) + ((${#methods[@]} > 0)) || return 0 + + # Kill the json flavor + methods=("${methods[@]//+(\"|,| )/}") + unset -v "methods[0]" "methods[-1]" # [] + + for method in "${methods[@]}"; do + rpc_methods["$method"]=1 + done +} + +_get_help_rpc_method() { + local rpc=$1 + local method=$2 + local rpc_help opt + + mapfile -t rpc_help < <(_get_help "$rpc" "$method") + + _get_help_pos "${rpc_help[@]}" + _get_help_opt "${rpc_help[@]}" +} + +_is_rpc_method() { + local word=$1 + + [[ -v rpc_methods["$word"] ]] +} + +_method_in_words() { + for word in "${words[@]}"; do + if _is_rpc_method "$word"; then + echo "$word" + return 0 + fi + done + return 1 +} + +_set_rpc_sock() { + # 
Look for unix sock each app creates upon its execution. In + # first instance, check the cmdline for an -s arg, if it's + # followed by the path to the sock, use it. + + local word + for ((word = 0; word < ${#words[@]}; word++)); do + if [[ ${words[word]} == -s && -S ${words[word + 1]} ]]; then + rpc_sock=${words[word + 1]} + return 0 + fi + done + + # default .sock + [[ -S /var/tmp/spdk.sock ]] && rpc_sock=/var/tmp/spdk.sock + + return 0 +} + +_spdk_opt_to_complete() { + local opt=$1 + + case "$opt" in + --pci-blacklist | -B | --pci-whitelist | -W) + local pcis + if [[ -e /sys/bus/pci/devices ]]; then + pcis=(/sys/bus/pci/devices/*) + pcis=("${pcis[@]##*/}") + fi + COMPREPLY=($(compgen -W '${pcis[*]}' -- "$cur")) + compopt -o filenames + ;; + --master-core | -p) # FIXME: Is this meant to be an actual core id or thread id? Assume the latter + local cpus + if [[ -e /sys/devices/system/cpu ]]; then + cpus=(/sys/devices/system/cpu/cpu+([0-9])) + cpus=("${cpus[@]##*cpu}") + fi + COMPREPLY=($(compgen -W '${cpus[*]}' -- "$cur")) + ;; + --iova-mode) + COMPREPLY=($(compgen -W 'pa va' -- "$cur")) + ;; + --tpoint-group-mask | -e) + COMPREPLY=($(compgen -W '$(_get_tpoint_g_masks)' -- "$cur")) + compopt -o nosort + ;; + --logflag) + COMPREPLY=($(compgen -W '$(_get_log_flags)' -- "$cur")) + ;; + --huge-dir) + COMPREPLY=($(compgen -W '$(_get_fs_mounts "hugetlbfs")' -- "$cur")) + compopt -o filenames + ;; + --iflag | --oflag) # spdk_dd specific + if [[ ${app##*/} == spdk_dd ]]; then + COMPREPLY=($(compgen -W '$(_get_help_pos "${app_help[@]}")' -- "$cur")) + fi + ;; + *) return 1 ;; + esac + return 0 +} + +_get_fs_mounts() { + [[ $(< /proc/filesystems) == *"$1"* ]] || return 0 + + local mount fs mounts + while read -r _ mount fs _; do + [[ $fs == "$1" ]] && mounts+=("$mount") + done < /proc/mounts + + if ((${#mounts[@]} > 0)); then + printf '%s\n' "${mounts[@]}" + fi +} + +_get_from_spdk_help() { + _get_help "$app" |& grep "$1" +} + +_get_tpoint_g_masks() { + local g_masks + + 
g_masks=$(_get_from_spdk_help "tracepoint group mask for spdk trace buffers") || return 0 + [[ $g_masks =~ \((.+)\) ]] || return 0 + + IFS=", " read -ra g_masks <<< "${BASH_REMATCH[1]}" + printf '%s\n' "${g_masks[@]}" +} + +_get_log_flags() { + local logflags + + logflags=$(_get_from_spdk_help "enable debug log flag") || return 0 + [[ $logflags =~ \((.+)\) ]] || return 0 + + if [[ -n ${BASH_REMATCH[1]} && ${BASH_REMATCH[1]} != "not supported"* ]]; then + IFS=", " read -ra logflags <<< "${BASH_REMATCH[1]}" + printf '%s\n' "${logflags[@]}" + fi +} + +_is_app() { + type -P "$1" > /dev/null +} + +_rpc() { + local cur prev words + + _init_completion || return + _is_app "$1" || return + + local rpc=$1 rpc_sock="" method="" + local -A rpc_methods=() + + _set_rpc_sock + _get_default_rpc_methods "$rpc" + + if method=$(_method_in_words); then + COMPREPLY=($(compgen -W '$(_get_help_rpc_method "$rpc" "$method")' -- "$cur")) + compopt -o nosort + elif [[ $cur == -* ]]; then + COMPREPLY=($(compgen -W '$(_parse_help "$rpc")' -- "$cur")) + elif [[ $prev == --verbose ]]; then + COMPREPLY=($(compgen -W 'DEBUG INFO ERROR' -- "$cur")) + elif [[ $prev == -s ]]; then + _filedir + else + COMPREPLY=($(compgen -W '${!rpc_methods[*]}' -- "$cur")) + fi +} + +_spdk_app() { + local cur prev + + _init_completion || return + _is_app "$1" || return + + local app=$1 app_help + + mapfile -t app_help < <(_get_help "$app") + + if [[ $cur == -* ]]; then + COMPREPLY=($(compgen -W '$(_get_help_opt "${app_help[@]}")' -- "$cur")) + else + _spdk_opt_to_complete "$prev" || _filedir + fi +} + +# Build simple completion for some common spdk apps|tools +_spdk_apps() { + local apps + + apps=( + iscsi_tgt + iscsi_top + nvmf_tgt + spdk_dd + spdk_tgt + spdk_top + spdk_trace_record + vhost + create_vbox.sh + create_vhost_vm.sh + pkgdep.sh + run-autorun.sh + vm_setup.sh + ) # TODO: Add more? 
+ + complete -F _spdk_app "${apps[@]}" + complete -F _rpc rpc.py +} + +_spdk_apps diff --git a/scripts/check_format.sh b/scripts/check_format.sh index 72840f3401a..f4df5c41b17 100755 --- a/scripts/check_format.sh +++ b/scripts/check_format.sh @@ -96,7 +96,7 @@ if hash astyle; then | xargs -P$(nproc) -n10 astyle --options=.astylerc >> astyle.log if grep -q "^Formatted" astyle.log; then echo " errors detected" - git diff + git diff --ignore-submodules=all sed -i -e 's/ / /g' astyle.log grep --color=auto "^Formatted.*" astyle.log echo "Incorrect code style detected in one or more files." @@ -233,7 +233,7 @@ for c_file in "${changed_c_libs[@]}"; do # Capture just the names of newly added (or modified) functions that start with "spdk_" mapfile -t defined_symbols < <(git diff -U0 $commit_to_compare HEAD -- $c_file | sed -En 's/(^[+])(spdk[a-z,A-Z,0-9,_]*)(\(.*)/\2/p') # Capture the names of removed symbols to catch edge cases where we just move definitions around. - mapfile -t removed_symbols < <(git diff -U0 HEAD $commit_to_compare -- $c_file | sed -En 's/(^[+])(spdk[a-z,A-Z,0-9,_]*)(\(.*)/\2/p') + mapfile -t removed_symbols < <(git diff -U0 $commit_to_compare HEAD -- $c_file | sed -En 's/(^[-])(spdk[a-z,A-Z,0-9,_]*)(\(.*)/\2/p') for symbol in "${removed_symbols[@]}"; do defined_symbols=("${defined_symbols[@]/$symbol/}") done diff --git a/scripts/common.sh b/scripts/common.sh index c6b9a53f8a6..60de9008020 100644 --- a/scripts/common.sh +++ b/scripts/common.sh @@ -49,8 +49,6 @@ cache_pci() { fi if [[ -n $vendor && -n $device ]]; then vendor=0x${vendor/0x/} device=0x${device/0x/} - pci_bus_cache["$vendor"]="${pci_bus_cache["$vendor"]:+${pci_bus_cache["$vendor"]} }$pci" - pci_bus_cache["$device"]="${pci_bus_cache["$device"]:+${pci_bus_cache["$device"]} }$pci" pci_bus_cache["$vendor:$device"]="${pci_bus_cache["$vendor:$device"]:+${pci_bus_cache["$vendor:$device"]} }$pci" pci_ids_vendor["$pci"]=$vendor @@ -212,3 +210,28 @@ function iter_pci_class_code() { fi done } + 
+function nvme_in_userspace() { + # Check used drivers. If it's not vfio-pci or uio-pci-generic + # then most likely PCI_WHITELIST option was used for setup.sh + # and we do not want to use that disk. + + local bdf bdfs + local nvmes + + if [[ -n ${pci_bus_cache["0x010802"]} ]]; then + nvmes=(${pci_bus_cache["0x010802"]}) + else + nvmes=($(iter_pci_class_code 01 08 02)) + fi + + for bdf in "${nvmes[@]}"; do + if [[ -e /sys/bus/pci/drivers/nvme/$bdf ]] \ + || [[ $(uname -s) == FreeBSD && $(pciconf -l "pci$bdf") == nvme* ]]; then + continue + fi + bdfs+=("$bdf") + done + ((${#bdfs[@]})) || return 1 + printf '%s\n' "${bdfs[@]}" +} diff --git a/scripts/gen_nvme.sh b/scripts/gen_nvme.sh index 6e1f5b517f3..47b72c31d08 100755 --- a/scripts/gen_nvme.sh +++ b/scripts/gen_nvme.sh @@ -13,41 +13,50 @@ function create_classic_config() { } function create_json_config() { - echo "{" - echo '"subsystem": "bdev",' - echo '"config": [' - for ((i = 0; i < ${#bdfs[@]}; i++)); do - echo '{' - echo '"params": {' - echo '"trtype": "PCIe",' - echo "\"name\": \"Nvme$i\"," - echo "\"traddr\": \"${bdfs[i]}\"" - echo '},' - echo '"method": "bdev_nvme_attach_controller"' - if [ -z ${bdfs[i + 1]} ]; then - echo '}' - else - echo '},' - fi + local bdev_json_cfg=() + + for i in "${!bdfs[@]}"; do + bdev_json_cfg+=("$( + cat <<- JSON + { + "method": "bdev_nvme_attach_controller", + "params": { + "trtype": "PCIe", + "name":"Nvme${i}", + "traddr":"${bdfs[i]}" + } + } + JSON + )") done - echo ']' - echo '}' + + local IFS="," + cat <<- JSON + { + "subsystem": "bdev", + "config": [ + ${bdev_json_cfg[*]} + ] + } + JSON +} + +function create_json_config_with_subsystems() { + cat <<- JSON + { + "subsystems": [ + $(create_json_config) + ] + } + JSON } -bdfs=() -# Check used drivers. If it's not vfio-pci or uio-pci-generic -# then most likely PCI_WHITELIST option was used for setup.sh -# and we do not want to use that disk. 
-for bdf in $(iter_pci_class_code 01 08 02); do - if [[ -e /sys/bus/pci/drivers/nvme/$bdf ]] \ - || [[ $(uname -s) == FreeBSD && $(pciconf -l "pci$bdf") == nvme* ]]; then - continue - fi - bdfs+=("$bdf") -done +bdfs=($(nvme_in_userspace)) if [ "$1" = "--json" ]; then create_json_config +elif [ "$1" = "--json-with-subsystems" ]; then + create_json_config_with_subsystems else create_classic_config fi diff --git a/scripts/perf/nvmf/README.md b/scripts/perf/nvmf/README.md index 69e6acd91b1..cd0cd64b416 100644 --- a/scripts/perf/nvmf/README.md +++ b/scripts/perf/nvmf/README.md @@ -33,10 +33,15 @@ So for example providing 2 IP's with 16 NVMe drives present will result in each "spdk" or "kernel" values allowed. -### use_null_block +### null_block_devices -Use null block device instead of present NVMe drives. Used for latency measurements as described -in Test Case 3 of performance report. +Integer. Use null block devices instead of present NVMe drives. +If set to 1, can be used for latency measurements as described in Test Case 3 of performance report. + +### null_block_dif_type + +Integer. Enable data protection on created null block device. Defaults to 0 if option +not present in JSON configuration file. See doc/jsonrpc.md "bdev_null_create" for details. ### num_cores @@ -52,6 +57,10 @@ by default. Not used if "mode" is set to "spdk". Number of shared buffers to use when creating transport layer. +### dif_insert_strip + +Boolean. If set to true - enable "dif_insert_or_strip" option for TCP transport layer. + ## Initiator Describes initiator arguments. There can be more than one initiator section in the configuration file. @@ -104,7 +113,7 @@ other than -t, -s, -n and -a. Fio job parameters. - bs: block size -- qd: io depth +- qd: io depth - Per connected fio filename target - rw: workload mode - rwmixread: percentage of reads in readwrite workloads - run_time: time (in seconds) to run workload @@ -131,7 +140,7 @@ as a runtime environment parameter. 
When the test completes, you will find a csv file (nvmf_results.csv) containing the results in the target node directory /tmp/results. -#Processor Counter Monitor (PCM) +# Processor Counter Monitor (PCM) PCM Tools provides a number of command-line utilities for real-time monitoring. Before using PCM Tools in nvmf perf scripts it needs to be installed on Target machine. PCM source and instructions are available on https://github.com/opcm/pcm. @@ -145,3 +154,15 @@ example: ``` Example above will run PCM measure for cpu and memory, with start delay 10s, sample every 1 second, and 30 samples for cpu measure. PCM memory do not support sample count. + +# Bandwidth monitor (bwm-ng) +bwm-ng is a command-line utility for real-time bandwidth monitoring. +Before using bwm-ng in nvmf perf scripts it needs to be installed on Target machine. +To enable bandwidth monitor in perf test you need to add Target setting in config.json file: +``` +"bandwidth_settings": [bool, sample_count] +``` +example: +``` +"bandwidth_settings": [true, 30] +``` diff --git a/scripts/perf/nvmf/run_nvmf.py b/scripts/perf/nvmf/run_nvmf.py index 7cf0e78cbfa..4171ea2a08d 100755 --- a/scripts/perf/nvmf/run_nvmf.py +++ b/scripts/perf/nvmf/run_nvmf.py @@ -14,6 +14,7 @@ import rpc import rpc.client import pandas as pd +from collections import OrderedDict from common import * @@ -36,13 +37,16 @@ def log_print(self, msg): class Target(Server): def __init__(self, name, username, password, mode, nic_ips, transport="rdma", - use_null_block=False, sar_settings=None, pcm_settings=None): + null_block_devices=0, sar_settings=None, pcm_settings=None, + bandwidth_settings=None, dpdk_settings=None): super(Target, self).__init__(name, username, password, mode, nic_ips, transport) - self.null_block = bool(use_null_block) + self.null_block = null_block_devices self.enable_sar = False self.enable_pcm_memory = False self.enable_pcm = False + self.enable_bandwidth = False + self.enable_dpdk_memory = False if
sar_settings: self.enable_sar, self.sar_delay, self.sar_interval, self.sar_count = sar_settings @@ -50,6 +54,12 @@ def __init__(self, name, username, password, mode, nic_ips, transport="rdma", if pcm_settings: self.pcm_dir, self.enable_pcm, self.enable_pcm_memory, self.pcm_delay, self.pcm_interval, self.pcm_count = pcm_settings + if bandwidth_settings: + self.enable_bandwidth, self.bandwidth_count = bandwidth_settings + + if dpdk_settings: + self.enable_dpdk_memory, self.dpdk_wait_time = dpdk_settings + self.script_dir = os.path.dirname(os.path.abspath(sys.argv[0])) self.spdk_dir = os.path.abspath(os.path.join(self.script_dir, "../../../")) @@ -126,23 +136,38 @@ def parse_results(self, results_dir, initiator_count=None, run_num=None): fio_files = filter(lambda x: ".fio" in x, files) json_files = [x for x in files if ".json" in x] + headers = ["read_iops", "read_bw", "read_avg_lat_us", "read_min_lat_us", "read_max_lat_us", + "read_p99_lat_us", "read_p99.9_lat_us", "read_p99.99_lat_us", "read_p99.999_lat_us", + "write_iops", "write_bw", "write_avg_lat_us", "write_min_lat_us", "write_max_lat_us", + "write_p99_lat_us", "write_p99.9_lat_us", "write_p99.99_lat_us", "write_p99.999_lat_us"] + + aggr_headers = ["iops", "bw", "avg_lat_us", "min_lat_us", "max_lat_us", + "p99_lat_us", "p99.9_lat_us", "p99.99_lat_us", "p99.999_lat_us"] + + header_line = ",".join(["Name", *headers]) + aggr_header_line = ",".join(["Name", *aggr_headers]) + # Create empty results file csv_file = "nvmf_results.csv" with open(os.path.join(results_dir, csv_file), "w") as fh: - header_line = ",".join(["Name", - "read_iops", "read_bw", "read_avg_lat_us", - "read_min_lat_us", "read_max_lat_us", "read_p99_lat_us", - "read_p99.9_lat_us", "read_p99.99_lat_us", "read_p99.999_lat_us", - "write_iops", "write_bw", "write_avg_lat_us", - "write_min_lat_us", "write_max_lat_us", "write_p99_lat_us", - "write_p99.9_lat_us", "write_p99.99_lat_us", "write_p99.999_lat_us"]) - fh.write(header_line + "\n") + 
fh.write(aggr_header_line + "\n") rows = set() for fio_config in fio_files: self.log_print("Getting FIO stats for %s" % fio_config) job_name, _ = os.path.splitext(fio_config) + # Look in the filename for rwmixread value. Function arguments do + # not have that information. + # TODO: Improve this function by directly using workload params instead + # of regexing through filenames. + if "read" in job_name: + rw_mixread = 1 + elif "write" in job_name: + rw_mixread = 0 + else: + rw_mixread = float(re.search(r"m_(\d+)", job_name).group(1)) / 100 + # If "_CPU" exists in name - ignore it # Initiators for the same job could have diffrent num_cores parameter job_name = re.sub(r"_\d+CPU", "", job_name) @@ -159,6 +184,7 @@ def parse_results(self, results_dir, initiator_count=None, run_num=None): self.log_print("\tGetting stats for initiator %s" % i) # There may have been more than 1 test run for this job, calculate average results for initiator i_results = [x for x in job_result_files if i in x] + i_results_filename = re.sub(r"run_\d+_", "", i_results[0].replace("json", "csv")) separate_stats = [] for r in i_results: @@ -166,19 +192,36 @@ def parse_results(self, results_dir, initiator_count=None, run_num=None): separate_stats.append(stats) self.log_print(stats) - z = [sum(c) for c in zip(*separate_stats)] - z = [c/len(separate_stats) for c in z] - inits_avg_results.append(z) + init_results = [sum(x) for x in zip(*separate_stats)] + init_results = [x / len(separate_stats) for x in init_results] + inits_avg_results.append(init_results) self.log_print("\tAverage results for initiator %s" % i) - self.log_print(z) + self.log_print(init_results) + with open(os.path.join(results_dir, i_results_filename), "w") as fh: + fh.write(header_line + "\n") + fh.write(",".join([job_name, *["{0:.3f}".format(x) for x in init_results]]) + "\n") + + # Sum results of all initiators running this FIO job. + # Latency results are an average of latencies from accros all initiators. 
+ inits_avg_results = [sum(x) for x in zip(*inits_avg_results)] + inits_avg_results = OrderedDict(zip(headers, inits_avg_results)) + for key in inits_avg_results: + if "lat" in key: + inits_avg_results[key] /= len(inits_names) + + # Aggregate separate read/write values into common labels + # Take rw_mixread into consideration for mixed read/write workloads. + aggregate_results = OrderedDict() + for h in aggr_headers: + read_stat, write_stat = [float(value) for key, value in inits_avg_results.items() if h in key] + if "lat" in h: + _ = rw_mixread * read_stat + (1 - rw_mixread) * write_stat + else: + _ = read_stat + write_stat + aggregate_results[h] = "{0:.3f}".format(_) - # Sum average results of all initiators running this FIO job - self.log_print("\tTotal results for %s from all initiators" % fio_config) - for a in inits_avg_results: - self.log_print(a) - total = ["{0:.3f}".format(sum(c)) for c in zip(*inits_avg_results)] - rows.add(",".join([job_name, *total])) + rows.add(",".join([job_name, *aggregate_results.values()])) # Save results to file for row in rows: @@ -216,6 +259,17 @@ def measure_pcm(self, results_dir, pcm_file_name): skt_pcm_file_name = "_".join(["skt", pcm_file_name]) skt.to_csv(os.path.join(results_dir, skt_pcm_file_name), index=False) + def measure_bandwidth(self, results_dir, bandwidth_file_name): + bwm = subprocess.run("bwm-ng -o csv -F %s/%s -a 1 -t 1000 -c %s" % (results_dir, bandwidth_file_name, + self.bandwidth_count), shell=True, check=True) + + def measure_dpdk_memory(self, results_dir): + self.log_print("INFO: waiting to generate DPDK memory usage") + time.sleep(self.dpdk_wait_time) + self.log_print("INFO: generating DPDK memory usage") + rpc.env.env_dpdk_get_mem_stats(self.client) + os.rename("/tmp/spdk_mem_dump.txt", "%s/spdk_mem_dump.txt" % (results_dir)) + class Initiator(Server): def __init__(self, name, username, password, mode, nic_ips, ip, transport="rdma", cpu_frequency=None, @@ -303,6 +357,10 @@ def discover_subsystems(self, address_list,
subsys_no): return subsystems + def gen_fio_filename_conf(self, *args, **kwargs): + # Logic implemented in SPDKInitiator and KernelInitiator classes + pass + def gen_fio_config(self, rw, rwmixread, block_size, io_depth, subsys_no, num_jobs=None, ramp_time=0, run_time=10): fio_conf_template = """ [global] @@ -317,7 +375,6 @@ def gen_fio_config(self, rw, rwmixread, block_size, io_depth, subsys_no, num_job rw={rw} rwmixread={rwmixread} bs={block_size} -iodepth={io_depth} time_based=1 ramp_time={ramp_time} runtime={run_time} @@ -331,7 +388,7 @@ def gen_fio_config(self, rw, rwmixread, block_size, io_depth, subsys_no, num_job else: ioengine = "libaio" spdk_conf = "" - out, err = self.remote_call("lsblk -o NAME -nlp") + out, err = self.remote_call("sudo nvme list | grep -E 'SPDK|Linux' | awk '{print $1}'") subsystems = [x for x in out.split("\n") if "nvme" in x] if self.cpus_allowed is not None: @@ -354,13 +411,13 @@ def gen_fio_config(self, rw, rwmixread, block_size, io_depth, subsys_no, num_job threads = range(0, len(subsystems)) if "spdk" in self.mode: - filename_section = self.gen_fio_filename_conf(subsystems, threads) + filename_section = self.gen_fio_filename_conf(subsystems, threads, io_depth, num_jobs) else: - filename_section = self.gen_fio_filename_conf(threads) + filename_section = self.gen_fio_filename_conf(threads, io_depth, num_jobs) fio_config = fio_conf_template.format(ioengine=ioengine, spdk_conf=spdk_conf, rw=rw, rwmixread=rwmixread, block_size=block_size, - io_depth=io_depth, ramp_time=ramp_time, run_time=run_time) + ramp_time=ramp_time, run_time=run_time) if num_jobs: fio_config = fio_config + "numjobs=%s \n" % num_jobs if self.cpus_allowed is not None: @@ -418,56 +475,17 @@ def run_fio(self, fio_config_file, run_num=None): class KernelTarget(Target): def __init__(self, name, username, password, mode, nic_ips, transport="rdma", - use_null_block=False, sar_settings=None, pcm_settings=None, - nvmet_bin="nvmetcli", **kwargs): + null_block_devices=0, 
sar_settings=None, pcm_settings=None, + bandwidth_settings=None, dpdk_settings=None, nvmet_bin="nvmetcli", **kwargs): super(KernelTarget, self).__init__(name, username, password, mode, nic_ips, transport, - use_null_block, sar_settings, pcm_settings) + null_block_devices, sar_settings, pcm_settings, bandwidth_settings, + dpdk_settings) self.nvmet_bin = nvmet_bin def __del__(self): nvmet_command(self.nvmet_bin, "clear") - def kernel_tgt_gen_nullblock_conf(self, address): - nvmet_cfg = { - "ports": [], - "hosts": [], - "subsystems": [], - } - - nvmet_cfg["subsystems"].append({ - "allowed_hosts": [], - "attr": { - "allow_any_host": "1", - "version": "1.3" - }, - "namespaces": [ - { - "device": { - "path": "/dev/nullb0", - "uuid": "%s" % uuid.uuid4() - }, - "enable": 1, - "nsid": 1 - } - ], - "nqn": "nqn.2018-09.io.spdk:cnode1" - }) - - nvmet_cfg["ports"].append({ - "addr": { - "adrfam": "ipv4", - "traddr": address, - "trsvcid": "4420", - "trtype": "%s" % self.transport, - }, - "portid": 1, - "referrals": [], - "subsystems": ["nqn.2018-09.io.spdk:cnode1"] - }) - with open("kernel.conf", 'w') as fh: - fh.write(json.dumps(nvmet_cfg, indent=2)) - def kernel_tgt_gen_subsystem_conf(self, nvme_list, address_list): nvmet_cfg = { @@ -488,6 +506,7 @@ def kernel_tgt_gen_subsystem_conf(self, nvme_list, address_list): "allowed_hosts": [], "attr": { "allow_any_host": "1", + "serial": "SPDK00%s" % subsys_no, "version": "1.3" }, "namespaces": [ @@ -526,12 +545,9 @@ def tgt_start(self): if self.null_block: print("Configuring with null block device.") - if len(self.nic_ips) > 1: - print("Testing with null block limited to single RDMA NIC.") - print("Please specify only 1 IP address.") - exit(1) - self.subsys_no = 1 - self.kernel_tgt_gen_nullblock_conf(self.nic_ips[0]) + null_blk_list = ["/dev/nullb{}".format(x) for x in range(self.null_block)] + self.kernel_tgt_gen_subsystem_conf(null_blk_list, self.nic_ips) + self.subsys_no = len(null_blk_list) else: print("Configuring with NVMe 
drives.") nvme_list = get_nvme_devices() @@ -546,34 +562,48 @@ def tgt_start(self): class SPDKTarget(Target): def __init__(self, name, username, password, mode, nic_ips, transport="rdma", - use_null_block=False, sar_settings=None, pcm_settings=None, - num_shared_buffers=4096, num_cores=1, **kwargs): + null_block_devices=0, null_block_dif_type=0, sar_settings=None, pcm_settings=None, + bandwidth_settings=None, dpdk_settings=None, num_shared_buffers=4096, + num_cores=1, dif_insert_strip=False, **kwargs): super(SPDKTarget, self).__init__(name, username, password, mode, nic_ips, transport, - use_null_block, sar_settings, pcm_settings) + null_block_devices, sar_settings, pcm_settings, bandwidth_settings, + dpdk_settings) self.num_cores = num_cores self.num_shared_buffers = num_shared_buffers + self.null_block_dif_type = null_block_dif_type + self.dif_insert_strip = dif_insert_strip def spdk_tgt_configure(self): self.log_print("Configuring SPDK NVMeOF target via RPC") numa_list = get_used_numa_nodes() # Create RDMA transport layer - rpc.nvmf.nvmf_create_transport(self.client, trtype=self.transport, num_shared_buffers=self.num_shared_buffers) + rpc.nvmf.nvmf_create_transport(self.client, trtype=self.transport, + num_shared_buffers=self.num_shared_buffers, + dif_insert_or_strip=self.dif_insert_strip) self.log_print("SPDK NVMeOF transport layer:") rpc.client.print_dict(rpc.nvmf.nvmf_get_transports(self.client)) if self.null_block: - nvme_section = self.spdk_tgt_add_nullblock() - subsystems_section = self.spdk_tgt_add_subsystem_conf(self.nic_ips, req_num_disks=1) + nvme_section = self.spdk_tgt_add_nullblock(self.null_block) + subsystems_section = self.spdk_tgt_add_subsystem_conf(self.nic_ips, self.null_block) else: nvme_section = self.spdk_tgt_add_nvme_conf() subsystems_section = self.spdk_tgt_add_subsystem_conf(self.nic_ips) self.log_print("Done configuring SPDK NVMeOF Target") - def spdk_tgt_add_nullblock(self): - self.log_print("Adding null block bdev to config via RPC") 
- rpc.bdev.bdev_null_create(self.client, 102400, 4096, "Nvme0n1") + def spdk_tgt_add_nullblock(self, null_block_count): + md_size = 0 + block_size = 4096 + if self.null_block_dif_type != 0: + md_size = 128 + + self.log_print("Adding null block bdevices to config via RPC") + for i in range(null_block_count): + self.log_print("Setting bdev protection to :%s" % self.null_block_dif_type) + rpc.bdev.bdev_null_create(self.client, 102400, block_size + md_size, "Nvme{}n1".format(i), + dif_type=self.null_block_dif_type, md_size=md_size) self.log_print("SPDK Bdevs configuration:") rpc.client.print_dict(rpc.bdev.bdev_get_bdevs(self.client)) @@ -629,7 +659,10 @@ def spdk_tgt_add_subsystem_conf(self, ips=None, req_num_disks=None): rpc.client.print_dict(rpc.nvmf.nvmf_get_subsystems(self.client)) def tgt_start(self): - self.subsys_no = get_nvme_devices_count() + if self.null_block: + self.subsys_no = 1 + else: + self.subsys_no = get_nvme_devices_count() self.log_print("Starting SPDK NVMeOF Target process") nvmf_app_path = os.path.join(self.spdk_dir, "build/bin/nvmf_tgt") command = " ".join([nvmf_app_path, "-m", self.num_cores]) @@ -693,15 +726,14 @@ def kernel_init_disconnect(self, address_list, subsys_no): self.remote_call("sudo %s disconnect -n %s" % (self.nvmecli_bin, subsystem[1])) time.sleep(1) - def gen_fio_filename_conf(self, threads): - out, err = self.remote_call("lsblk -o NAME -nlp") + def gen_fio_filename_conf(self, threads, io_depth, num_jobs=1): + out, err = self.remote_call("sudo nvme list | grep -E 'SPDK|Linux' | awk '{print $1}'") nvme_list = [x for x in out.split("\n") if "nvme" in x] filename_section = "" - filenames = ["nvme%sn1" % x for x in range(0, len(nvme_list))] nvme_per_split = int(len(nvme_list) / len(threads)) remainder = len(nvme_list) % len(threads) - iterator = iter(filenames) + iterator = iter(nvme_list) result = [] for i in range(len(threads)): result.append([]) @@ -712,8 +744,12 @@ def gen_fio_filename_conf(self, threads): remainder -= 1 for i, r 
in enumerate(result): header = "[filename%s]" % i - disks = "\n".join(["filename=/dev/%s" % x for x in r]) - filename_section = "\n".join([filename_section, header, disks]) + disks = "\n".join(["filename=%s" % x for x in r]) + job_section_qd = round((io_depth * len(r)) / num_jobs) + if job_section_qd == 0: + job_section_qd = 1 + iodepth = "iodepth=%s" % job_section_qd + filename_section = "\n".join([filename_section, header, disks, iodepth]) return filename_section @@ -735,8 +771,8 @@ def install_spdk(self, local_spdk_zip): self.log_print("Sources unpacked") self.log_print("Using fio binary %s" % self.fio_bin) - self.remote_call("cd %s; git submodule update --init; ./configure --with-rdma --with-fio=%s;" - "make clean; make -j$(($(nproc)*2))" % (self.spdk_dir, os.path.dirname(self.fio_bin))) + self.remote_call("cd %s; git submodule update --init; make clean; ./configure --with-rdma --with-fio=%s;" + "make -j$(($(nproc)*2))" % (self.spdk_dir, os.path.dirname(self.fio_bin))) self.log_print("SPDK built") self.remote_call("sudo %s/scripts/setup.sh" % self.spdk_dir) @@ -754,7 +790,7 @@ def gen_spdk_bdev_conf(self, remote_subsystem_list): bdev_section = "\n".join([header, bdev_rows]) return bdev_section - def gen_fio_filename_conf(self, subsystems, threads): + def gen_fio_filename_conf(self, subsystems, threads, io_depth, num_jobs=1): filename_section = "" if len(threads) >= len(subsystems): threads = range(0, len(subsystems)) @@ -773,7 +809,11 @@ def gen_fio_filename_conf(self, subsystems, threads): for i, r in enumerate(result): header = "[filename%s]" % i disks = "\n".join(["filename=%s" % x for x in r]) - filename_section = "\n".join([filename_section, header, disks]) + job_section_qd = round((io_depth * len(r)) / num_jobs) + if job_section_qd == 0: + job_section_qd = 1 + iodepth = "iodepth=%s" % job_section_qd + filename_section = "\n".join([filename_section, header, disks, iodepth]) return filename_section @@ -821,15 +861,16 @@ def gen_fio_filename_conf(self, 
subsystems, threads): continue # Copy and install SPDK on remote initiators - target_obj.zip_spdk_sources(target_obj.spdk_dir, spdk_zip_path) - threads = [] - for i in initiators: - if i.mode == "spdk": - t = threading.Thread(target=i.install_spdk, args=(spdk_zip_path,)) - threads.append(t) - t.start() - for t in threads: - t.join() + if "skip_spdk_install" not in data["general"]: + target_obj.zip_spdk_sources(target_obj.spdk_dir, spdk_zip_path) + threads = [] + for i in initiators: + if i.mode == "spdk": + t = threading.Thread(target=i.install_spdk, args=(spdk_zip_path,)) + threads.append(t) + t.start() + for t in threads: + t.join() target_obj.tgt_start() @@ -867,6 +908,16 @@ def gen_fio_filename_conf(self, subsystems, threads): t = threading.Thread(target=target_obj.measure_pcm_memory, args=(target_results_dir, pcm_file_name,)) threads.append(t) + if target_obj.enable_bandwidth: + bandwidth_file_name = "_".join(["bandwidth", str(block_size), str(rw), str(io_depth)]) + bandwidth_file_name = ".".join([bandwidth_file_name, "csv"]) + t = threading.Thread(target=target_obj.measure_bandwidth, args=(target_results_dir, bandwidth_file_name,)) + threads.append(t) + + if target_obj.enable_dpdk_memory: + t = threading.Thread(target=target_obj.measure_dpdk_memory, args=(target_results_dir,)) + threads.append(t) + for t in threads: t.start() for t in threads: diff --git a/scripts/pkgdep.sh b/scripts/pkgdep.sh index 4b1ca7477b2..3045b6a24ef 100755 --- a/scripts/pkgdep.sh +++ b/scripts/pkgdep.sh @@ -16,6 +16,7 @@ function usage() { echo " -f --fuse Additional dependencies for FUSE and NVMe-CUSE" echo " -r --rdma Additional dependencies for RDMA transport in NVMe over Fabrics" echo " -b --docs Additional dependencies for building docs" + echo " -u --uring Additional dependencies for io_uring" echo "" exit 0 } @@ -26,53 +27,7 @@ function install_all_dependencies() { INSTALL_FUSE=true INSTALL_RDMA=true INSTALL_DOCS=true -} - -function install_shfmt() { - # Fetch version that has
been tested - local shfmt_version=3.1.0 - local shfmt=shfmt-$shfmt_version - local shfmt_dir=${SHFMT_DIR:-/opt/shfmt} - local shfmt_dir_out=${SHFMT_DIR_OUT:-/usr/bin} - local shfmt_url - local os - - if hash "$shfmt" && [[ $("$shfmt" --version) == "v$shfmt_version" ]]; then - echo "$shfmt already installed" - return 0 - fi 2> /dev/null - - os=$(uname -s) - - case "$os" in - Linux) shfmt_url=https://github.com/mvdan/sh/releases/download/v$shfmt_version/shfmt_v${shfmt_version}_linux_amd64 ;; - FreeBSD) shfmt_url=https://github.com/mvdan/sh/releases/download/v$shfmt_version/shfmt_v${shfmt_version}_freebsd_amd64 ;; - *) - echo "Not supported OS (${os:-Unknown}), skipping" - return 0 - ;; - esac - - mkdir -p "$shfmt_dir" - mkdir -p "$shfmt_dir_out" - - echo "Fetching ${shfmt_url##*/}"... - local err - if err=$(curl -f -Lo"$shfmt_dir/$shfmt" "$shfmt_url" 2>&1); then - chmod +x "$shfmt_dir/$shfmt" - ln -sf "$shfmt_dir/$shfmt" "$shfmt_dir_out" - else - cat <<- CURL_ERR - - * Fetching $shfmt_url failed, $shfmt will not be available for format check. 
- * Error: - - $err - - CURL_ERR - return 0 - fi - echo "$shfmt installed" + INSTALL_LIBURING=true } INSTALL_CRYPTO=false @@ -81,8 +36,9 @@ INSTALL_PMEM=false INSTALL_FUSE=false INSTALL_RDMA=false INSTALL_DOCS=false +INSTALL_LIBURING=false -while getopts 'abdfhipr-:' optchar; do +while getopts 'abdfhipru-:' optchar; do case "$optchar" in -) case "$OPTARG" in @@ -93,6 +49,7 @@ while getopts 'abdfhipr-:' optchar; do fuse) INSTALL_FUSE=true ;; rdma) INSTALL_RDMA=true ;; docs) INSTALL_DOCS=true ;; + uring) INSTALL_LIBURING=true ;; *) echo "Invalid argument '$OPTARG'" usage @@ -106,6 +63,7 @@ while getopts 'abdfhipr-:' optchar; do f) INSTALL_FUSE=true ;; r) INSTALL_RDMA=true ;; b) INSTALL_DOCS=true ;; + u) INSTALL_LIBURING=true ;; *) echo "Invalid argument '$OPTARG'" usage @@ -126,8 +84,14 @@ fi ID=${ID:-$OS} ID=${ID,,} +#Link suse related OS to sles +if [[ ${ID,,} == *"suse"* ]]; then + ID="sles" +fi + if [[ -e $scriptsdir/pkgdep/$ID.sh ]]; then source "$scriptsdir/pkgdep/$ID.sh" + source "$scriptsdir/pkgdep/common.sh" else printf 'Not supported platform detected (%s), aborting\n' "$ID" >&2 fi diff --git a/scripts/pkgdep/arch.sh b/scripts/pkgdep/arch.sh index 32944b92582..a43dbd96a28 100755 --- a/scripts/pkgdep/arch.sh +++ b/scripts/pkgdep/arch.sh @@ -4,7 +4,7 @@ pacman -Sy --needed --noconfirm gcc make cunit libaio openssl \ libutil-linux libiscsi python ncurses ninja meson # Additional dependencies for SPDK CLI -pacman -Sy --needed --noconfirm python-pexpect python-pip +pacman -Sy --needed --noconfirm python-pexpect python-pip libffi pip install configshell_fb # Additional dependencies for DPDK pacman -Sy --needed --noconfirm numactl nasm @@ -32,7 +32,6 @@ if [[ $INSTALL_DEV_TOOLS == "true" ]]; then makepkg -si --needed --noconfirm; cd .. 
&& rm -rf lcov-git; popd" - install_shfmt fi if [[ $INSTALL_PMEM == "true" ]]; then # Additional dependencies for building pmem based backends diff --git a/scripts/pkgdep/clear-linux-os.sh b/scripts/pkgdep/clear-linux-os.sh index d9583c2ece7..da155f2d9d4 100755 --- a/scripts/pkgdep/clear-linux-os.sh +++ b/scripts/pkgdep/clear-linux-os.sh @@ -3,7 +3,7 @@ # Install main dependencies swupd bundle-add -y c-basic make dev-utils openssl devpkg-libiscsi \ devpkg-ncurses python3-basic python-extras devpkg-open-iscsi \ - jq storage-utils + storage-utils # Additional dependencies for ISA-L used in compression swupd bundle-add -y dev-utils-dev # Additional dependencies for DPDK @@ -13,7 +13,6 @@ pip3 install pexpect pip3 install configshell_fb if [[ $INSTALL_DEV_TOOLS == "true" ]]; then swupd bundle-add -y git os-testsuite-0day - install_shfmt fi if [[ $INSTALL_PMEM == "true" ]]; then # Additional dependencies for building pmem based backends diff --git a/scripts/pkgdep/common.sh b/scripts/pkgdep/common.sh new file mode 100755 index 00000000000..80ab9bfe168 --- /dev/null +++ b/scripts/pkgdep/common.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +install_liburing() { + local GIT_REPO_LIBURING=https://github.com/axboe/liburing.git + local liburing_dir=/usr/local/src/liburing + + if [[ $(ldconfig -p) == *liburing.so* ]]; then + echo "liburing is already installed. 
skipping" + else + if [[ -d $liburing_dir ]]; then + echo "liburing source already present, not cloning" + else + mkdir -p $liburing_dir + git clone "${GIT_REPO_LIBURING}" "$liburing_dir" + fi + (cd "$liburing_dir" && ./configure --libdir=/usr/lib64 && make install) + fi +} + +install_shfmt() { + # Fetch version that has been tested + local shfmt_version=3.1.0 + local shfmt=shfmt-$shfmt_version + local shfmt_dir=${SHFMT_DIR:-/opt/shfmt} + local shfmt_dir_out=${SHFMT_DIR_OUT:-/usr/bin} + local shfmt_url + local os + + if hash "$shfmt" && [[ $("$shfmt" --version) == "v$shfmt_version" ]]; then + echo "$shfmt already installed" + return 0 + fi 2> /dev/null + + os=$(uname -s) + + case "$os" in + Linux) shfmt_url=https://github.com/mvdan/sh/releases/download/v$shfmt_version/shfmt_v${shfmt_version}_linux_amd64 ;; + FreeBSD) shfmt_url=https://github.com/mvdan/sh/releases/download/v$shfmt_version/shfmt_v${shfmt_version}_freebsd_amd64 ;; + *) + echo "Not supported OS (${os:-Unknown}), skipping" + return 0 + ;; + esac + + mkdir -p "$shfmt_dir" + mkdir -p "$shfmt_dir_out" + + echo "Fetching ${shfmt_url##*/}"... + local err + if err=$(curl -f -Lo"$shfmt_dir/$shfmt" "$shfmt_url" 2>&1); then + chmod +x "$shfmt_dir/$shfmt" + ln -sf "$shfmt_dir/$shfmt" "$shfmt_dir_out" + else + cat <<- CURL_ERR + + * Fetching $shfmt_url failed, $shfmt will not be available for format check. + * Error: + + $err + + CURL_ERR + return 0 + fi + echo "$shfmt installed" +} + +install_spdk_bash_completion() { + [[ -e /usr/share/bash-completion/bash_completion ]] || return 0 + + local compat_dir=/etc/bash_completion.d + mkdir -p "$compat_dir" + + if [[ ! 
-e $compat_dir/spdk ]]; then + ln -vs "$scriptsdir/bash-completion/spdk" "$compat_dir" + fi +} + +if [[ $INSTALL_DEV_TOOLS == true ]]; then + install_shfmt + install_spdk_bash_completion +fi + +if [[ $INSTALL_LIBURING == true ]]; then + install_liburing +fi diff --git a/scripts/pkgdep/debian.sh b/scripts/pkgdep/debian.sh index 155f6e5cdc1..fd239602e73 100755 --- a/scripts/pkgdep/debian.sh +++ b/scripts/pkgdep/debian.sh @@ -23,12 +23,11 @@ apt-get install -y libnuma-dev apt-get install -y autoconf automake libtool help2man if [[ $INSTALL_DEV_TOOLS == "true" ]]; then # Tools for developers - apt-get install -y git astyle pep8 lcov clang sg3-utils pciutils shellcheck + apt-get install -y git astyle pep8 lcov clang sg3-utils pciutils shellcheck abigail-tools # Additional python style checker not available on ubuntu 16.04 or earlier. apt-get install -y pycodestyle || true # Additional dependecies for nvmf performance test script apt-get install -y python3-paramiko - install_shfmt fi if [[ $INSTALL_PMEM == "true" ]]; then # Additional dependencies for building pmem based backends diff --git a/scripts/pkgdep/rhel.sh b/scripts/pkgdep/rhel.sh index d751cedf327..4f44637ff0e 100755 --- a/scripts/pkgdep/rhel.sh +++ b/scripts/pkgdep/rhel.sh @@ -1,23 +1,91 @@ #!/usr/bin/env bash +disclaimer() { + case "$ID" in + rhel) + cat <<- WARN + + WARNING: $PRETTY_NAME system detected. + + Please, note that the support for this platform is considered to be "best-effort", + as in, access to some packages may be limited and/or missing. Review your repo + setup to make sure installation of all dependencies is possible. + + WARN + + # Don't trigger errexit, simply install what's available. This is default + # behavior of older yum versions (e.g. the one present on RHEL 7.x) anyway. 
+ yum() { "$(type -P yum)" --skip-broken "$@"; } + ;; + + *) ;; + esac +} + +disclaimer + +# First, add extra EPEL, ELRepo, Ceph repos to have a chance of covering most of the packages +# on the enterprise systems, like RHEL. +if [[ $ID == centos || $ID == rhel ]]; then + repos=() enable=("epel" "elrepo" "elrepo-testing") + [[ $ID == centos ]] && enable+=("extras") + if [[ $VERSION_ID == 7* ]]; then + repos+=("https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm") + repos+=("https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm") + [[ $ID == centos ]] && repos+=("centos-release-ceph-nautilus.noarch") + # Disable liburing, see https://github.com/spdk/spdk/issues/1564 + if [[ $INSTALL_LIBURING == true ]]; then + echo "Liburing not supported on ${ID}$VERSION_ID, disabling" + INSTALL_LIBURING=false + fi + fi + if [[ $VERSION_ID == 8* ]]; then + repos+=("https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm") + repos+=("https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm") + [[ $ID == centos ]] && repos+=("centos-release-ceph-nautilus.noarch") + # Add PowerTools needed for install CUnit-devel in Centos8 + [[ $ID == centos ]] && repos+=("yum-utils") && enable+=("PowerTools") + fi + if ((${#repos[@]} > 0)); then + yum install -y "${repos[@]}" + yum-config-manager --enable "${enable[@]}" + fi + # Potential dependencies can be needed from other RHEL repos, enable them + if [[ $ID == rhel ]]; then + [[ $VERSION_ID == 7* ]] && subscription-manager repos --enable "rhel-*-optional-rpms" --enable "rhel-*-extras-rpms" + [[ $VERSION_ID == 8* ]] && subscription-manager repos --enable codeready-builder-for-rhel-8-x86_64-rpms + fi +fi + # Minimal install -if echo "$ID $VERSION_ID" | grep -E -q 'centos 8'; then - # Add PowerTools needed for install CUnit-devel in Centos8 - yum install -y yum-utils - yum config-manager --set-enabled PowerTools +# workaround for arm: ninja fails with dep on skbuild python module +if [ "$(uname -m)" = 
"aarch64" ]; then + pip3 install scikit-build + if echo "$ID $VERSION_ID" | grep -E -q 'centos 7'; then + # by default centos 7.x uses cmake 2.8 while ninja requires 3.6 or higher + yum install -y cmake3 + # cmake3 is installed as /usr/bin/cmake3 while ninja directly calls `cmake`. Create a soft link + # as a workaround + mkdir -p /tmp/bin/ + ln -s /usr/bin/cmake3 /tmp/bin/cmake > /dev/null 2>&1 || true + export PATH=/tmp/bin:$PATH + fi fi + yum install -y gcc gcc-c++ make CUnit-devel libaio-devel openssl-devel \ libuuid-devel libiscsi-devel ncurses-devel -if echo "$ID $VERSION_ID" | grep -E -q 'centos 8'; then +if echo "$ID $VERSION_ID" | grep -E -q 'centos 8|rhel 8'; then yum install -y python36 #Create hard link to use in SPDK as python - ln /etc/alternatives/python3 /usr/bin/python || true + if [[ ! -e /usr/bin/python && -e /etc/alternatives/python3 ]]; then + ln -s /etc/alternatives/python3 /usr/bin/python + fi else yum install -y python fi yum install -y python3-pip -pip-3 install ninja -pip-3 install meson +pip3 install ninja +pip3 install meson # Additional dependencies for SPDK CLI - not available in rhel and centos if ! echo "$ID $VERSION_ID" | grep -E -q 'rhel 7|centos 7'; then @@ -29,27 +97,13 @@ yum install -y autoconf automake libtool help2man yum install -y numactl-devel nasm if [[ $INSTALL_DEV_TOOLS == "true" ]]; then # Tools for developers - # Includes Fedora, CentOS 7, RHEL 7 - # Add EPEL repository for CUnit-devel - if echo "$ID $VERSION_ID" | grep -E -q 'rhel 7|centos 7|centos 8'; then - if ! 
rpm --quiet -q epel-release; then - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm - fi - - if [[ $ID = 'rhel' ]]; then - subscription-manager repos --enable "rhel-*-optional-rpms" --enable "rhel-*-extras-rpms" - elif [[ $ID = 'centos' ]]; then - yum --enablerepo=extras install -y epel-release - fi - fi if echo "$ID $VERSION_ID" | grep -E -q 'centos 8'; then yum install -y python3-pycodestyle echo "Centos 8 does not have lcov and ShellCheck dependencies" else yum install -y python-pycodestyle lcov ShellCheck fi - yum install -y git astyle sg3_utils pciutils - install_shfmt + yum install -y git astyle sg3_utils pciutils libabigail fi if [[ $INSTALL_PMEM == "true" ]]; then # Additional dependencies for building pmem based backends diff --git a/scripts/pkgdep/sles.sh b/scripts/pkgdep/sles.sh index c5d14b67eb5..0c743d254a5 100755 --- a/scripts/pkgdep/sles.sh +++ b/scripts/pkgdep/sles.sh @@ -11,7 +11,6 @@ if [[ $INSTALL_DEV_TOOLS == "true" ]]; then # Tools for developers zypper install -y git-core lcov python-pycodestyle sg3_utils \ pciutils ShellCheck - install_shfmt fi if [[ $INSTALL_PMEM == "true" ]]; then # Additional dependencies for building pmem based backends diff --git a/scripts/rpc.py b/scripts/rpc.py index 4e3e519d28c..0abefd189e2 100755 --- a/scripts/rpc.py +++ b/scripts/rpc.py @@ -174,15 +174,26 @@ def bdev_set_options(args): p.set_defaults(bdev_auto_examine=True) p.set_defaults(func=bdev_set_options) + def bdev_examine(args): + rpc.bdev.bdev_examine(args.client, + name=args.name) + + p = subparsers.add_parser('bdev_examine', + help="""examine a bdev if it exists, or will examine it after it is created""") + p.add_argument('-b', '--name', help='Name or alias of the bdev') + p.set_defaults(func=bdev_examine) + def bdev_compress_create(args): print_json(rpc.bdev.bdev_compress_create(args.client, base_bdev_name=args.base_bdev_name, - pm_path=args.pm_path)) + pm_path=args.pm_path, + lb_size=args.lb_size)) p = 
subparsers.add_parser('bdev_compress_create', aliases=['construct_compress_bdev'], help='Add a compress vbdev') p.add_argument('-b', '--base_bdev_name', help="Name of the base bdev") p.add_argument('-p', '--pm_path', help="Path to persistent memory") + p.add_argument('-l', '--lb_size', help="Compressed vol logical block size (optional, if used must be 512 or 4096)", type=int, default=0) p.set_defaults(func=bdev_compress_create) def bdev_compress_delete(args): @@ -194,13 +205,13 @@ def bdev_compress_delete(args): p.add_argument('name', help='compress bdev name') p.set_defaults(func=bdev_compress_delete) - def compress_set_pmd(args): - rpc.bdev.compress_set_pmd(args.client, - pmd=args.pmd) - p = subparsers.add_parser('compress_set_pmd', aliases=['set_compress_pmd'], + def bdev_compress_set_pmd(args): + rpc.bdev.bdev_compress_set_pmd(args.client, + pmd=args.pmd) + p = subparsers.add_parser('bdev_compress_set_pmd', aliases=['set_compress_pmd', 'compress_set_pmd'], help='Set pmd option for a compress disk') p.add_argument('-p', '--pmd', type=int, help='0 = auto-select, 1= QAT only, 2 = ISAL only') - p.set_defaults(func=compress_set_pmd) + p.set_defaults(func=bdev_compress_set_pmd) def bdev_compress_get_orphans(args): print_dict(rpc.bdev.bdev_compress_get_orphans(args.client, @@ -241,12 +252,21 @@ def bdev_ocf_create(args): print_json(rpc.bdev.bdev_ocf_create(args.client, name=args.name, mode=args.mode, + cache_line_size=args.cache_line_size, cache_bdev_name=args.cache_bdev_name, core_bdev_name=args.core_bdev_name)) p = subparsers.add_parser('bdev_ocf_create', aliases=['construct_ocf_bdev'], help='Add an OCF block device') p.add_argument('name', help='Name of resulting OCF bdev') p.add_argument('mode', help='OCF cache mode', choices=['wb', 'wt', 'pt', 'wa', 'wi', 'wo']) + p.add_argument( + '--cache-line-size', + help='OCF cache line size. 
The unit is KiB', + type=int, + choices=[4, 8, 16, 32, 64], + required=False, + default=0, + ) p.add_argument('cache_bdev_name', help='Name of underlying cache bdev') p.add_argument('core_bdev_name', help='Name of unerlying core bdev') p.set_defaults(func=bdev_ocf_create) @@ -303,6 +323,9 @@ def bdev_malloc_delete(args): def bdev_null_create(args): num_blocks = (args.total_size * 1024 * 1024) // args.block_size + if args.dif_type and not args.md_size: + print("ERROR: --md-size must be > 0 when --dif-type is > 0") + exit(1) print_json(rpc.bdev.bdev_null_create(args.client, num_blocks=num_blocks, block_size=args.block_size, @@ -316,15 +339,16 @@ def bdev_null_create(args): help='Add a bdev with null backend') p.add_argument('name', help='Block device name') p.add_argument('-u', '--uuid', help='UUID of the bdev') - p.add_argument( - 'total_size', help='Size of null bdev in MB (int > 0)', type=int) - p.add_argument('block_size', help='Block size for this bdev', type=int) + p.add_argument('total_size', help='Size of null bdev in MB (int > 0). Includes only data blocks.', type=int) + p.add_argument('block_size', help='Block size for this bdev.' + 'Should be a sum of block size and metadata size if --md-size is used.', type=int) p.add_argument('-m', '--md-size', type=int, - help='Metadata size for this bdev. Default 0') - p.add_argument('-t', '--dif-type', type=int, choices=[0, 1, 2, 3], - help='Protection information type. Default: 0 - no protection') + help='Metadata size for this bdev. Default=0.') + p.add_argument('-t', '--dif-type', type=int, default=0, choices=[0, 1, 2, 3], + help='Protection information type. Parameter --md-size needs' + 'to be set along --dif-type. Default=0 - no protection.') p.add_argument('-d', '--dif-is-head-of-md', action='store_true', - help='Protection information is in the first 8 bytes of metadata. Default: in the last 8 bytes') + help='Protection information is in the first 8 bytes of metadata. 
Default=false.') p.set_defaults(func=bdev_null_create) def bdev_null_delete(args): @@ -482,11 +506,25 @@ def bdev_nvme_get_controllers(args): def bdev_nvme_detach_controller(args): rpc.bdev.bdev_nvme_detach_controller(args.client, - name=args.name) + name=args.name, + trtype=args.trtype, + traddr=args.traddr, + adrfam=args.adrfam, + trsvcid=args.trsvcid, + subnqn=args.subnqn) p = subparsers.add_parser('bdev_nvme_detach_controller', aliases=['delete_nvme_controller'], help='Detach an NVMe controller and delete any associated bdevs') p.add_argument('name', help="Name of the controller") + p.add_argument('-t', '--trtype', + help='NVMe-oF target trtype: e.g., rdma, pcie') + p.add_argument('-a', '--traddr', + help='NVMe-oF target address: e.g., an ip address or BDF') + p.add_argument('-f', '--adrfam', + help='NVMe-oF target adrfam: e.g., ipv4, ipv6, ib, fc, intra_host') + p.add_argument('-s', '--trsvcid', + help='NVMe-oF target trsvcid: e.g., a port number') + p.add_argument('-n', '--subnqn', help='NVMe-oF target subnqn') p.set_defaults(func=bdev_nvme_detach_controller) def bdev_nvme_cuse_register(args): @@ -767,7 +805,7 @@ def bdev_set_qos_limit(args): help='Set QoS rate limit on a blockdev') p.add_argument('name', help='Blockdev name to set QoS. Example: Malloc0') p.add_argument('--rw_ios_per_sec', - help='R/W IOs per second limit (>=10000, example: 20000). 0 means unlimited.', + help='R/W IOs per second limit (>=1000, example: 20000). 0 means unlimited.', type=int, required=False) p.add_argument('--rw_mbytes_per_sec', help="R/W megabytes per second limit (>=10, example: 100). 
0 means unlimited.", @@ -827,7 +865,9 @@ def iscsi_set_options(args): first_burst_length=args.first_burst_length, immediate_data=args.immediate_data, error_recovery_level=args.error_recovery_level, - allow_duplicated_isid=args.allow_duplicated_isid) + allow_duplicated_isid=args.allow_duplicated_isid, + max_large_datain_per_connection=args.max_large_datain_per_connection, + max_r2t_per_connection=args.max_r2t_per_connection) p = subparsers.add_parser('iscsi_set_options', aliases=['set_iscsi_options'], help="""Set options of iSCSI subsystem""") @@ -851,6 +891,8 @@ def iscsi_set_options(args): p.add_argument('-i', '--immediate-data', help='Negotiated parameter, ImmediateData.', action='store_true') p.add_argument('-l', '--error-recovery-level', help='Negotiated parameter, ErrorRecoveryLevel', type=int) p.add_argument('-p', '--allow-duplicated-isid', help='Allow duplicated initiator session ID.', action='store_true') + p.add_argument('-x', '--max-large-datain-per-connection', help='Max number of outstanding split read I/Os per connection', type=int) + p.add_argument('-k', '--max-r2t-per-connection', help='Max number of outstanding R2Ts per connection', type=int) p.set_defaults(func=iscsi_set_options) def iscsi_set_discovery_auth(args): @@ -1085,6 +1127,36 @@ def iscsi_target_node_remove_pg_ig_maps(args): *** The Portal/Initiator Groups must be precreated ***""") p.set_defaults(func=iscsi_target_node_remove_pg_ig_maps) + def iscsi_target_node_set_redirect(args): + rpc.iscsi.iscsi_target_node_set_redirect( + args.client, + name=args.name, + pg_tag=args.pg_tag, + redirect_host=args.redirect_host, + redirect_port=args.redirect_port) + + p = subparsers.add_parser('iscsi_target_node_set_redirect', + help="""Update redirect portal of the public portal group for the target node. 
+ Omit redirect host and port to clear previously set redirect settings.""") + p.add_argument('name', help='Target node name (ASCII)') + p.add_argument('pg_tag', help='Portal group tag (unique, integer > 0)', type=int) + p.add_argument('-a', '--redirect_host', help='Numeric IP address for redirect portal', required=False) + p.add_argument('-p', '--redirect_port', help='Numeric TCP port for redirect portal', required=False) + p.set_defaults(func=iscsi_target_node_set_redirect) + + def iscsi_target_node_request_logout(args): + rpc.iscsi.iscsi_target_node_request_logout( + args.client, + name=args.name, + pg_tag=args.pg_tag) + + p = subparsers.add_parser('iscsi_target_node_request_logout', + help="""For the target node, request connections whose portal group tag + match to logout, or request all connections if portal group tag is omitted.""") + p.add_argument('name', help='Target node name (ASCII)') + p.add_argument('-t', '--pg-tag', help='Portal group tag (unique, integer > 0)', type=int, required=False) + p.set_defaults(func=iscsi_target_node_request_logout) + def iscsi_create_portal_group(args): portals = [] for p in args.portal_list.strip().split(' '): @@ -1101,7 +1173,8 @@ def iscsi_create_portal_group(args): rpc.iscsi.iscsi_create_portal_group( args.client, portals=portals, - tag=args.tag) + tag=args.tag, + private=args.private) p = subparsers.add_parser('iscsi_create_portal_group', aliases=['add_portal_group'], help='Add a portal group') @@ -1109,6 +1182,10 @@ def iscsi_create_portal_group(args): 'tag', help='Portal group tag (unique, integer > 0)', type=int) p.add_argument('portal_list', help="""List of portals in host:port format, separated by whitespace Example: '192.168.100.100:3260 192.168.100.100:3261 192.168.100.100:3262""") + p.add_argument('-p', '--private', help="""Public (false) or private (true) portal group. + Private portal groups do not have their portals returned by a discovery session. 
A public + portal group may optionally specify a redirect portal for non-discovery logins. This redirect + portal must be from a private portal group.""", action='store_true') p.set_defaults(func=iscsi_create_portal_group) def iscsi_create_initiator_group(args): @@ -1726,7 +1803,9 @@ def nvmf_create_transport(args): no_srq=args.no_srq, c2h_success=args.c2h_success, dif_insert_or_strip=args.dif_insert_or_strip, - sock_priority=args.sock_priority) + sock_priority=args.sock_priority, + acceptor_backlog=args.acceptor_backlog, + abort_timeout_sec=args.abort_timeout_sec) p = subparsers.add_parser('nvmf_create_transport', help='Create NVMf transport') p.add_argument('-t', '--trtype', help='Transport type (ex. RDMA)', type=str, required=True) @@ -1746,6 +1825,8 @@ def nvmf_create_transport(args): p.add_argument('-o', '--c2h-success', action='store_false', help='Disable C2H success optimization. Relevant only for TCP transport') p.add_argument('-f', '--dif-insert-or-strip', action='store_true', help='Enable DIF insert/strip. Relevant only for TCP transport') p.add_argument('-y', '--sock-priority', help='The sock priority of the tcp connection. Relevant only for TCP transport', type=int) + p.add_argument('-l', '--acceptor_backlog', help='Pending connections allowed at one time. 
Relevant only for RDMA transport', type=int) + p.add_argument('-x', '--abort-timeout-sec', help='Abort execution timeout value, in seconds', type=int) p.set_defaults(func=nvmf_create_transport) def nvmf_get_transports(args): @@ -1771,7 +1852,8 @@ def nvmf_create_subsystem(args): serial_number=args.serial_number, model_number=args.model_number, allow_any_host=args.allow_any_host, - max_namespaces=args.max_namespaces) + max_namespaces=args.max_namespaces, + ana_reporting=args.ana_reporting) p = subparsers.add_parser('nvmf_create_subsystem', aliases=['nvmf_subsystem_create'], help='Create an NVMe-oF subsystem') @@ -1786,6 +1868,7 @@ def nvmf_create_subsystem(args): p.add_argument("-a", "--allow-any-host", action='store_true', help="Allow any host to connect (don't enforce host NQN whitelist)") p.add_argument("-m", "--max-namespaces", help="Maximum number of namespaces allowed", type=int, default=0) + p.add_argument("-r", "--ana-reporting", action='store_true', help="Enable ANA reporting feature") p.set_defaults(func=nvmf_create_subsystem) def nvmf_delete_subsystem(args): @@ -1836,6 +1919,26 @@ def nvmf_subsystem_remove_listener(args): p.add_argument('-s', '--trsvcid', help='NVMe-oF transport service id: e.g., a port number') p.set_defaults(func=nvmf_subsystem_remove_listener) + def nvmf_subsystem_listener_set_ana_state(args): + rpc.nvmf.nvmf_subsystem_listener_set_ana_state(args.client, + nqn=args.nqn, + ana_state=args.ana_state, + trtype=args.trtype, + traddr=args.traddr, + tgt_name=args.tgt_name, + adrfam=args.adrfam, + trsvcid=args.trsvcid) + + p = subparsers.add_parser('nvmf_subsystem_listener_set_ana_state', help='Set ANA state of a listener for an NVMe-oF subsystem') + p.add_argument('nqn', help='NVMe-oF subsystem NQN') + p.add_argument('-n', '--ana-state', help='ANA state to set: optimized, non-optimized, or inaccessible', required=True) + p.add_argument('-t', '--trtype', help='NVMe-oF transport type: e.g., rdma', required=True) + p.add_argument('-a', 
'--traddr', help='NVMe-oF transport address: e.g., an ip address', required=True) + p.add_argument('-p', '--tgt_name', help='The name of the parent NVMe-oF target (optional)', type=str) + p.add_argument('-f', '--adrfam', help='NVMe-oF transport adrfam: e.g., ipv4, ipv6, ib, fc, intra_host') + p.add_argument('-s', '--trsvcid', help='NVMe-oF transport service id: e.g., a port number') + p.set_defaults(func=nvmf_subsystem_listener_set_ana_state) + def nvmf_subsystem_add_ns(args): rpc.nvmf.nvmf_subsystem_add_ns(args.client, nqn=args.nqn, @@ -1907,6 +2010,39 @@ def nvmf_subsystem_allow_any_host(args): p.add_argument('-t', '--tgt_name', help='The name of the parent NVMe-oF target (optional)', type=str) p.set_defaults(func=nvmf_subsystem_allow_any_host) + def nvmf_subsystem_get_controllers(args): + print_dict(rpc.nvmf.nvmf_subsystem_get_controllers(args.client, + nqn=args.nqn, + tgt_name=args.tgt_name)) + + p = subparsers.add_parser('nvmf_subsystem_get_controllers', + help='Display controllers of an NVMe-oF subsystem.') + p.add_argument('nqn', help='NVMe-oF subsystem NQN') + p.add_argument('-t', '--tgt-name', help='The name of the parent NVMe-oF target (optional)', type=str) + p.set_defaults(func=nvmf_subsystem_get_controllers) + + def nvmf_subsystem_get_qpairs(args): + print_dict(rpc.nvmf.nvmf_subsystem_get_qpairs(args.client, + nqn=args.nqn, + tgt_name=args.tgt_name)) + + p = subparsers.add_parser('nvmf_subsystem_get_qpairs', + help='Display queue pairs of an NVMe-oF subsystem.') + p.add_argument('nqn', help='NVMe-oF subsystem NQN') + p.add_argument('-t', '--tgt-name', help='The name of the parent NVMe-oF target (optional)', type=str) + p.set_defaults(func=nvmf_subsystem_get_qpairs) + + def nvmf_subsystem_get_listeners(args): + print_dict(rpc.nvmf.nvmf_subsystem_get_listeners(args.client, + nqn=args.nqn, + tgt_name=args.tgt_name)) + + p = subparsers.add_parser('nvmf_subsystem_get_listeners', + help='Display listeners of an NVMe-oF subsystem.') + p.add_argument('nqn', 
help='NVMe-oF subsystem NQN') + p.add_argument('-t', '--tgt-name', help='The name of the parent NVMe-oF target (optional)', type=str) + p.set_defaults(func=nvmf_subsystem_get_listeners) + def nvmf_get_stats(args): print_dict(rpc.nvmf.nvmf_get_stats(args.client, tgt_name=args.tgt_name)) @@ -2321,6 +2457,15 @@ def thread_set_cpumask(args): p.add_argument('-m', '--cpumask', help='cpumask for this thread') p.set_defaults(func=thread_set_cpumask) + def log_enable_timestamps(args): + ret = rpc.app.log_enable_timestamps(args.client, + enabled=args.enabled) + p = subparsers.add_parser('log_enable_timestamps', + help='Enable or disable timestamps.') + p.add_argument('-d', '--disable', dest='enabled', default=False, action='store_false', help="Disable timestamps") + p.add_argument('-e', '--enable', dest='enabled', action='store_true', help="Enable timestamps") + p.set_defaults(func=log_enable_timestamps) + def thread_get_pollers(args): print_dict(rpc.app.thread_get_pollers(args.client)) @@ -2380,6 +2525,48 @@ def blobfs_set_cache_size(args): p.add_argument('size_in_mb', help='Cache size for blobfs in megabytes.', type=int) p.set_defaults(func=blobfs_set_cache_size) + # sock + def sock_impl_get_options(args): + print_json(rpc.sock.sock_impl_get_options(args.client, + impl_name=args.impl)) + + p = subparsers.add_parser('sock_impl_get_options', help="""Get options of socket layer implementation""") + p.add_argument('-i', '--impl', help='Socket implementation name, e.g. 
posix', required=True) + p.set_defaults(func=sock_impl_get_options) + + def sock_impl_set_options(args): + rpc.sock.sock_impl_set_options(args.client, + impl_name=args.impl, + recv_buf_size=args.recv_buf_size, + send_buf_size=args.send_buf_size, + enable_recv_pipe=args.enable_recv_pipe, + enable_zerocopy_send=args.enable_zerocopy_send, + enable_quickack=args.enable_quickack, + enable_placement_id=args.enable_placement_id) + + p = subparsers.add_parser('sock_impl_set_options', help="""Set options of socket layer implementation""") + p.add_argument('-i', '--impl', help='Socket implementation name, e.g. posix', required=True) + p.add_argument('-r', '--recv-buf-size', help='Size of receive buffer on socket in bytes', type=int) + p.add_argument('-s', '--send-buf-size', help='Size of send buffer on socket in bytes', type=int) + p.add_argument('--enable-recv-pipe', help='Enable receive pipe', + action='store_true', dest='enable_recv_pipe') + p.add_argument('--disable-recv-pipe', help='Disable receive pipe', + action='store_false', dest='enable_recv_pipe') + p.add_argument('--enable-zerocopy-send', help='Enable zerocopy on send', + action='store_true', dest='enable_zerocopy_send') + p.add_argument('--disable-zerocopy-send', help='Disable zerocopy on send', + action='store_false', dest='enable_zerocopy_send') + p.add_argument('--enable-quickack', help='Enable quick ACK', + action='store_true', dest='enable_quickack') + p.add_argument('--disable-quickack', help='Disable quick ACK', + action='store_false', dest='enable_quickack') + p.add_argument('--enable-placement_id', help='Enable placement_id', + action='store_true', dest='enable_placement_id') + p.add_argument('--disable-placement_id', help='Disable placement_id', + action='store_false', dest='enable_placement_id') + p.set_defaults(func=sock_impl_set_options, enable_recv_pipe=None, enable_zerocopy_send=None, + enable_quickack=None, enable_placement_id=None) + def check_called_name(name): if name in deprecated_aliases: 
print("{} is deprecated, use {} instead.".format(name, deprecated_aliases[name]), file=sys.stderr) diff --git a/scripts/rpc/__init__.py b/scripts/rpc/__init__.py index 912e1367690..f764d7ae565 100644 --- a/scripts/rpc/__init__.py +++ b/scripts/rpc/__init__.py @@ -23,6 +23,7 @@ from . import trace from . import vhost from . import vmd +from . import sock from . import client as rpc_client from .helpers import deprecated_alias diff --git a/scripts/rpc/app.py b/scripts/rpc/app.py index 9412de17d64..3d25ee70f5d 100644 --- a/scripts/rpc/app.py +++ b/scripts/rpc/app.py @@ -60,6 +60,19 @@ def thread_set_cpumask(client, id, cpumask): return client.call('thread_set_cpumask', params) +def log_enable_timestamps(client, enabled): + """Enable or disable timestamps. + + Args: + value: on or off + + Returns: + None + """ + params = {'enabled': enabled} + return client.call('log_enable_timestamps', params) + + def thread_get_pollers(client): """Query current pollers. diff --git a/scripts/rpc/bdev.py b/scripts/rpc/bdev.py index 60bd8ffd55d..ea90f7d837b 100644 --- a/scripts/rpc/bdev.py +++ b/scripts/rpc/bdev.py @@ -22,18 +22,32 @@ def bdev_set_options(client, bdev_io_pool_size=None, bdev_io_cache_size=None, bd return client.call('bdev_set_options', params) +def bdev_examine(client, name): + """Examine a bdev manually. If the bdev does not exist yet when this RPC is called, + it will be examined when it is created + + Args: + name: name of the bdev + """ + params = { + 'name': name + } + return client.call('bdev_examine', params) + + @deprecated_alias('construct_compress_bdev') -def bdev_compress_create(client, base_bdev_name, pm_path): +def bdev_compress_create(client, base_bdev_name, pm_path, lb_size): """Construct a compress virtual block device. Args: base_bdev_name: name of the underlying base bdev pm_path: path to persistent memory + lb_size: logical block size for the compressed vol in bytes. Must be 4K or 512. Returns: Name of created virtual block device. 
""" - params = {'base_bdev_name': base_bdev_name, 'pm_path': pm_path} + params = {'base_bdev_name': base_bdev_name, 'pm_path': pm_path, 'lb_size': lb_size} return client.call('bdev_compress_create', params) @@ -50,7 +64,8 @@ def bdev_compress_delete(client, name): @deprecated_alias('set_compress_pmd') -def compress_set_pmd(client, pmd): +@deprecated_alias('compress_set_pmd') +def bdev_compress_set_pmd(client, pmd): """Set pmd options for the bdev compress. Args: @@ -58,7 +73,7 @@ def compress_set_pmd(client, pmd): """ params = {'pmd': pmd} - return client.call('compress_set_pmd', params) + return client.call('bdev_compress_set_pmd', params) def bdev_compress_get_orphans(client, name=None): @@ -109,19 +124,26 @@ def bdev_crypto_delete(client, name): @deprecated_alias('construct_ocf_bdev') -def bdev_ocf_create(client, name, mode, cache_bdev_name, core_bdev_name): +def bdev_ocf_create(client, name, mode, cache_line_size, cache_bdev_name, core_bdev_name): """Add an OCF block device Args: name: name of constructed OCF bdev mode: OCF cache mode: {'wb', 'wt', 'pt', 'wa', 'wi', 'wo'} + cache_line_size: OCF cache line size. 
The unit is KiB: {4, 8, 16, 32, 64} cache_bdev_name: name of underlying cache bdev core_bdev_name: name of underlying core bdev Returns: Name of created block device """ - params = {'name': name, 'mode': mode, 'cache_bdev_name': cache_bdev_name, 'core_bdev_name': core_bdev_name} + params = { + 'name': name, + 'mode': mode, + 'cache_line_size': cache_line_size, + 'cache_bdev_name': cache_bdev_name, + 'core_bdev_name': core_bdev_name, + } return client.call('bdev_ocf_create', params) @@ -441,10 +463,10 @@ def bdev_nvme_attach_controller(client, name, trtype, traddr, adrfam=None, trsvc Args: name: bdev name prefix; "n" + namespace ID will be appended to create unique names - trtype: transport type ("PCIe", "RDMA") + trtype: transport type ("PCIe", "RDMA", "FC", "TCP") traddr: transport address (PCI BDF or IP address) - adrfam: address family ("IPv4", "IPv6", "IB", or "FC") (optional for PCIe) - trsvcid: transport service ID (port number for IP-based addresses; optional for PCIe) + adrfam: address family ("IPv4", "IPv6", "IB", or "FC") + trsvcid: transport service ID (port number for IP-based addresses) priority: transport connection priority (Sock priority for TCP-based transports; optional) subnqn: subsystem NQN to connect to (optional) hostnqn: NQN to connect from (optional) @@ -491,14 +513,40 @@ def bdev_nvme_attach_controller(client, name, trtype, traddr, adrfam=None, trsvc @deprecated_alias('delete_nvme_controller') -def bdev_nvme_detach_controller(client, name): - """Detach NVMe controller and delete any associated bdevs. +def bdev_nvme_detach_controller(client, name, trtype=None, traddr=None, + adrfam=None, trsvcid=None, subnqn=None): + """Detach NVMe controller and delete any associated bdevs. Optionally, + If all of the transport ID options are specified, only remove that + transport path from the specified controller. 
If that is the only + available path for the controller, this will also result in the + controller being detached and the associated bdevs being deleted. Args: name: controller name + trtype: transport type ("PCIe", "RDMA") + traddr: transport address (PCI BDF or IP address) + adrfam: address family ("IPv4", "IPv6", "IB", or "FC") + trsvcid: transport service ID (port number for IP-based addresses) + subnqn: subsystem NQN to connect to (optional) """ params = {'name': name} + + if trtype: + params['trtype'] = trtype + + if traddr: + params['traddr'] = traddr + + if adrfam: + params['adrfam'] = adrfam + + if trsvcid: + params['trsvcid'] = trsvcid + + if subnqn: + params['subnqn'] = subnqn + return client.call('bdev_nvme_detach_controller', params) @@ -1071,7 +1119,7 @@ def bdev_set_qos_limit( Args: name: name of block device - rw_ios_per_sec: R/W IOs per second limit (>=10000, example: 20000). 0 means unlimited. + rw_ios_per_sec: R/W IOs per second limit (>=1000, example: 20000). 0 means unlimited. rw_mbytes_per_sec: R/W megabytes per second limit (>=10, example: 100). 0 means unlimited. r_mbytes_per_sec: Read megabytes per second limit (>=10, example: 100). 0 means unlimited. w_mbytes_per_sec: Write megabytes per second limit (>=10, example: 100). 0 means unlimited. diff --git a/scripts/rpc/iscsi.py b/scripts/rpc/iscsi.py index 6d64b618528..d6f29fdb6dd 100644 --- a/scripts/rpc/iscsi.py +++ b/scripts/rpc/iscsi.py @@ -20,7 +20,9 @@ def iscsi_set_options( first_burst_length=None, immediate_data=None, error_recovery_level=None, - allow_duplicated_isid=None): + allow_duplicated_isid=None, + max_large_datain_per_connection=None, + max_r2t_per_connection=None): """Set iSCSI target options. 
Args: @@ -41,6 +43,8 @@ def iscsi_set_options( immediate_data: Negotiated parameter, ImmediateData error_recovery_level: Negotiated parameter, ErrorRecoveryLevel allow_duplicated_isid: Allow duplicated initiator session ID + max_large_datain_per_connection: Max number of outstanding split read I/Os per connection (optional) + max_r2t_per_connection: Max number of outstanding R2Ts per connection (optional) Returns: True or False @@ -81,6 +85,10 @@ def iscsi_set_options( params['error_recovery_level'] = error_recovery_level if allow_duplicated_isid: params['allow_duplicated_isid'] = allow_duplicated_isid + if max_large_datain_per_connection: + params['max_large_datain_per_connection'] = max_large_datain_per_connection + if max_r2t_per_connection: + params['max_r2t_per_connection'] = max_r2t_per_connection return client.call('iscsi_set_options', params) @@ -374,18 +382,63 @@ def iscsi_target_node_add_pg_ig_maps(client, pg_ig_maps, name): return client.call('iscsi_target_node_add_pg_ig_maps', params) +def iscsi_target_node_set_redirect(client, name, pg_tag, redirect_host, redirect_port): + """Update redirect portal of the public portal group for the target node. + + Args: + name: Target node name (ASCII) + pg_tag: Portal group tag (unique, integer > 0) + redirect_host: Numeric IP address to which the target node is redirected + redirect_port: Numeric TCP port to which the target node is redirected + + Returns: + True or False + """ + params = { + 'name': name, + 'pg_tag': pg_tag + } + + if redirect_host: + params['redirect_host'] = redirect_host + if redirect_port: + params['redirect_port'] = redirect_port + return client.call('iscsi_target_node_set_redirect', params) + + +def iscsi_target_node_request_logout(client, name, pg_tag): + """Request connections to the target node to logout. 
+ + Args: + name: Target node name (ASCII) + pg_tag: Portal group tag (unique, integer > 0) (optional) + + Returns: + True or False + """ + params = {'name': name} + + if pg_tag: + params['pg_tag'] = pg_tag + return client.call('iscsi_target_node_request_logout', params) + + @deprecated_alias('add_portal_group') -def iscsi_create_portal_group(client, portals, tag): +def iscsi_create_portal_group(client, portals, tag, private): """Add a portal group. Args: portals: List of portals, e.g. [{'host': ip, 'port': port}] tag: Initiator group tag (unique, integer > 0) + private: Public (false) or private (true) portal group for login redirection. Returns: True or False """ params = {'tag': tag, 'portals': portals} + + if private: + params['private'] = private return client.call('iscsi_create_portal_group', params) diff --git a/scripts/rpc/nvmf.py b/scripts/rpc/nvmf.py index 398544ef958..567ef3c8857 100644 --- a/scripts/rpc/nvmf.py +++ b/scripts/rpc/nvmf.py @@ -107,7 +107,9 @@ def nvmf_create_transport(client, no_srq=False, c2h_success=True, dif_insert_or_strip=None, - sock_priority=None): + sock_priority=None, + acceptor_backlog=None, + abort_timeout_sec=None): """NVMf Transport Create options. 
Args: @@ -125,6 +127,8 @@ def nvmf_create_transport(client, no_srq: Boolean flag to disable SRQ even for devices that support it - RDMA specific (optional) c2h_success: Boolean flag to disable the C2H success optimization - TCP specific (optional) dif_insert_or_strip: Boolean flag to enable DIF insert/strip for I/O - TCP specific (optional) + acceptor_backlog: Pending connections allowed at one time - RDMA specific (optional) + abort_timeout_sec: Abort execution timeout value, in seconds (optional) Returns: True or False @@ -163,6 +167,10 @@ def nvmf_create_transport(client, params['dif_insert_or_strip'] = dif_insert_or_strip if sock_priority: params['sock_priority'] = sock_priority + if acceptor_backlog is not None: + params['acceptor_backlog'] = acceptor_backlog + if abort_timeout_sec: + params['abort_timeout_sec'] = abort_timeout_sec return client.call('nvmf_create_transport', params) @@ -213,7 +221,8 @@ def nvmf_create_subsystem(client, tgt_name=None, model_number='SPDK bdev Controller', allow_any_host=False, - max_namespaces=0): + max_namespaces=0, + ana_reporting=False): """Construct an NVMe over Fabrics target subsystem. Args: @@ -223,6 +232,8 @@ def nvmf_create_subsystem(client, model_number: Model number of virtual controller. allow_any_host: Allow any host (True) or enforce allowed host whitelist (False). Default: False. max_namespaces: Maximum number of namespaces that can be attached to the subsystem (optional). Default: 0 (Unlimited). + ana_reporting: Enable ANA reporting feature. Default: False. 
+ Returns: True or False @@ -246,6 +257,9 @@ def nvmf_create_subsystem(client, if tgt_name: params['tgt_name'] = tgt_name + if ana_reporting: + params['ana_reporting'] = ana_reporting + return client.call('nvmf_create_subsystem', params) @@ -316,6 +330,46 @@ def nvmf_subsystem_remove_listener( return client.call('nvmf_subsystem_remove_listener', params) +def nvmf_subsystem_listener_set_ana_state( + client, + nqn, + ana_state, + trtype, + traddr, + trsvcid, + adrfam, + tgt_name=None): + """Set ANA state of a listener for an NVMe-oF subsystem. + + Args: + nqn: Subsystem NQN. + ana_state: ANA state to set ("optimized", "non_optimized", or "inaccessible"). + trtype: Transport type ("RDMA"). + traddr: Transport address. + trsvcid: Transport service ID. + tgt_name: name of the parent NVMe-oF target (optional). + adrfam: Address family ("IPv4", "IPv6", "IB", or "FC"). + + Returns: + True or False + """ + listen_address = {'trtype': trtype, + 'traddr': traddr, + 'trsvcid': trsvcid} + + if adrfam: + listen_address['adrfam'] = adrfam + + params = {'nqn': nqn, + 'listen_address': listen_address, + 'ana_state': ana_state} + + if tgt_name: + params['tgt_name'] = tgt_name + + return client.call('nvmf_subsystem_listener_set_ana_state', params) + + def nvmf_subsystem_add_ns(client, nqn, bdev_name, tgt_name=None, ptpl_file=None, nsid=None, nguid=None, eui64=None, uuid=None): """Add a namespace to a subsystem. @@ -455,6 +509,60 @@ def nvmf_delete_subsystem(client, nqn, tgt_name=None): return client.call('nvmf_delete_subsystem', params) +def nvmf_subsystem_get_controllers(client, nqn, tgt_name=None): + """Get list of controllers of an NVMe-oF subsystem. + + Args: + nqn: Subsystem NQN. + tgt_name: name of the parent NVMe-oF target (optional). + + Returns: + List of controller objects of an NVMe-oF subsystem. 
+ """ + params = {'nqn': nqn} + + if tgt_name: + params['tgt_name'] = tgt_name + + return client.call('nvmf_subsystem_get_controllers', params) + + +def nvmf_subsystem_get_qpairs(client, nqn, tgt_name=None): + """Get list of queue pairs of an NVMe-oF subsystem. + + Args: + nqn: Subsystem NQN. + tgt_name: name of the parent NVMe-oF target (optional). + + Returns: + List of queue pair objects of an NVMe-oF subsystem. + """ + params = {'nqn': nqn} + + if tgt_name: + params['tgt_name'] = tgt_name + + return client.call('nvmf_subsystem_get_qpairs', params) + + +def nvmf_subsystem_get_listeners(client, nqn, tgt_name=None): + """Get list of listeners of an NVMe-oF subsystem. + + Args: + nqn: Subsystem NQN. + tgt_name: name of the parent NVMe-oF target (optional). + + Returns: + List of listener objects of an NVMe-oF subsystem. + """ + params = {'nqn': nqn} + + if tgt_name: + params['tgt_name'] = tgt_name + + return client.call('nvmf_subsystem_get_listeners', params) + + def nvmf_get_stats(client, tgt_name=None): """Query NVMf statistics. diff --git a/scripts/rpc/sock.py b/scripts/rpc/sock.py new file mode 100644 index 00000000000..94f9b829398 --- /dev/null +++ b/scripts/rpc/sock.py @@ -0,0 +1,49 @@ +def sock_impl_get_options(client, impl_name=None): + """Get parameters for the socket layer implementation. + + Args: + impl_name: name of socket implementation, e.g. posix + """ + params = {} + + params['impl_name'] = impl_name + + return client.call('sock_impl_get_options', params) + + +def sock_impl_set_options(client, + impl_name=None, + recv_buf_size=None, + send_buf_size=None, + enable_recv_pipe=None, + enable_zerocopy_send=None, + enable_quickack=None, + enable_placement_id=None): + """Set parameters for the socket layer implementation. + + Args: + impl_name: name of socket implementation, e.g. 
posix + recv_buf_size: size of socket receive buffer in bytes (optional) + send_buf_size: size of socket send buffer in bytes (optional) + enable_recv_pipe: enable or disable receive pipe (optional) + enable_zerocopy_send: enable or disable zerocopy on send (optional) + enable_quickack: enable or disable quickack (optional) + enable_placement_id: enable or disable placement_id (optional) + """ + params = {} + + params['impl_name'] = impl_name + if recv_buf_size is not None: + params['recv_buf_size'] = recv_buf_size + if send_buf_size is not None: + params['send_buf_size'] = send_buf_size + if enable_recv_pipe is not None: + params['enable_recv_pipe'] = enable_recv_pipe + if enable_zerocopy_send is not None: + params['enable_zerocopy_send'] = enable_zerocopy_send + if enable_quickack is not None: + params['enable_quickack'] = enable_quickack + if enable_placement_id is not None: + params['enable_placement_id'] = enable_placement_id + + return client.call('sock_impl_set_options', params) diff --git a/scripts/rxe_cfg_small.sh b/scripts/rxe_cfg_small.sh index 0674efe393c..2f1128569a3 100755 --- a/scripts/rxe_cfg_small.sh +++ b/scripts/rxe_cfg_small.sh @@ -10,6 +10,10 @@ declare -r rdma_rxe_rm=$rdma_rxe/parameters/remove declare -r infiniband=/sys/class/infiniband declare -r net=/sys/class/net +declare -A net_devices +declare -A net_to_rxe +declare -A rxe_to_net + uevent() ( [[ -e $1/uevent ]] || return 0 @@ -70,11 +74,7 @@ start() { } stop() { - local rxe - - for rxe in "$infiniband/rxe"+([0-9]); do - remove_rxe "${rxe##*/}" - done + remove_rxe if ! 
modprobeq -r rdma_rxe \ || [[ -e $rdma_rxe ]]; then @@ -93,7 +93,7 @@ status() { printf 'rdma_rxe module not loaded\n' >&2 fi - local dev net_devs + local dev local link_map link_map[0]=no @@ -102,21 +102,14 @@ status() { status_header local name link driver speed mtu ip rxe rxe_dev active_mtu - for dev in "$net/"!(bonding_masters); do - (($(< "$dev/type") == 1)) || continue - + for dev in "${net_devices[@]}"; do name="" link="" driver="" speed="" mtu="" ip="" rxe_dev="" active_mtu="" name=${dev##*/} - for rxe in "$infiniband/rxe"+([0-9]); do - if [[ $(< "$rxe/parent") == "$name" ]]; then - rxe_dev=${rxe##*/} - active_mtu=$(get_rxe_mtu "$rxe_dev") - break - fi - done + rxe_dev=${net_to_rxe["$name"]} + active_mtu=$(get_rxe_mtu "$rxe_dev") link=${link_map[$(< "$dev/carrier")]} @@ -206,30 +199,41 @@ add_rxe() { [[ -e $rdma_rxe/parameters ]] || return 1 if [[ -z $1 || $1 == all ]]; then - net_devs=("$net/"!(bonding_masters)) - elif [[ -e $net/$1 ]]; then - net_devs=("$net/$1") + net_devs=("${!net_devices[@]}") + elif [[ -n ${net_to_rxe["$1"]} ]]; then + printf '%s interface already in use (%s)\n' \ + "$1" "${net_to_rxe["$1"]}" + return 0 + elif [[ -n ${net_devices["$1"]} ]]; then + net_devs=("$1") else printf '%s interface does not exist\n' "$1" return 1 fi for dev in "${net_devs[@]}"; do - (($(< "$dev/type") != 1)) && continue - echo "${dev##*/}" > "$rdma_rxe_add" + if [[ -z ${net_to_rxe["$dev"]} ]]; then + echo "${dev##*/}" > "$rdma_rxe_add" + fi + link_up "${dev##*/}" done 2> /dev/null } remove_rxe() { - [[ -e $infiniband/${1##*/} ]] && echo "${1##*/}" > "$rdma_rxe_rm" -} + local rxes rxe -link_up_rxes() { - local rxe parent + rxes=("${!rxe_to_net[@]}") + if [[ -z $1 || $1 == all ]]; then + rxes=("${!rxe_to_net[@]}") + elif [[ -z ${rxe_to_net["$1"]} ]]; then + printf '%s rxe interface does not exist\n' "$1" + return 0 + elif [[ -n ${rxe_to_net["$1"]} ]]; then + rxes=("$1") + fi - for rxe in "$infiniband/rxe"+([0-9]); do - parent=$(< /"$rxe/parent") - link_up 
"$parent" + for rxe in "${rxes[@]}"; do + echo "$rxe" > "$rdma_rxe_rm" done } @@ -239,6 +243,24 @@ link_up() { echo $(($(< "$net/$1/flags") | 0x1)) > "$net/$1/flags" } +collect_devices() { + local net_dev rxe_dev + + for net_dev in "$net/"!(bonding_masters); do + (($(< "$net_dev/type") != 1)) && continue + net_devices["${net_dev##*/}"]=$net_dev + for rxe_dev in "$infiniband/rxe"+([0-9]); do + if [[ $(< "$rxe_dev/parent") == "${net_dev##*/}" ]]; then + net_to_rxe["${net_dev##*/}"]=${rxe_dev##*/} + rxe_to_net["${rxe_dev##*/}"]=${net_dev##*/} + continue 2 + fi + done + done +} + +collect_devices + case "${1:-status}" in start) start @@ -250,7 +272,7 @@ case "${1:-status}" in add_rxe "${2:-all}" ;; remove) - remove_rxe "$2" + remove_rxe "${2:-all}" ;; status) IFS= read -r match < <( diff --git a/scripts/setup.sh b/scripts/setup.sh index ca271e173aa..8c81ee63e76 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -65,6 +65,9 @@ function usage() { echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" echo " bind devices to the given driver." echo " E.g. 
DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" + echo "PCI_BLOCK_SYNC_ON_RESET" + echo " If set in the environment, the attempt to wait for block devices associated" + echo " with given PCI device will be made upon reset" exit 0 } @@ -88,32 +91,24 @@ function check_for_driver() { function pci_dev_echo() { local bdf="$1" - local vendor - local device - vendor="$(cat /sys/bus/pci/devices/$bdf/vendor)" - device="$(cat /sys/bus/pci/devices/$bdf/device)" shift - echo "$bdf (${vendor#0x} ${device#0x}): $*" + echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" } function linux_bind_driver() { bdf="$1" driver_name="$2" - old_driver_name="no driver" - ven_dev_id=$(lspci -n -s $bdf | cut -d' ' -f3 | sed 's/:/ /') + old_driver_name=${drivers_d["$bdf"]:-no driver} + ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}" - if [ -e "/sys/bus/pci/devices/$bdf/driver" ]; then - old_driver_name=$(basename $(readlink /sys/bus/pci/devices/$bdf/driver)) - - if [ "$driver_name" = "$old_driver_name" ]; then - pci_dev_echo "$bdf" "Already using the $old_driver_name driver" - return 0 - fi - - echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true - echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" + if [[ $driver_name == "$old_driver_name" ]]; then + pci_dev_echo "$bdf" "Already using the $old_driver_name driver" + return 0 fi + echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true + echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" + pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true @@ -130,13 +125,12 @@ function linux_bind_driver() { function linux_unbind_driver() { local bdf="$1" local ven_dev_id - ven_dev_id=$(lspci -n -s $bdf | cut -d' ' -f3 | sed 's/:/ /') - local old_driver_name="no driver" + ven_dev_id="${pci_ids_vendor["$bdf"]#0x} 
${pci_ids_device["$bdf"]#0x}" + local old_driver_name=${drivers_d["$bdf"]:-no driver} - if [ -e "/sys/bus/pci/devices/$bdf/driver" ]; then - old_driver_name=$(basename $(readlink /sys/bus/pci/devices/$bdf/driver)) - echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true - echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" + if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then + echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true + echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind" fi pci_dev_echo "$bdf" "$old_driver_name -> no driver" @@ -146,37 +140,99 @@ function linux_hugetlbfs_mounts() { mount | grep ' type hugetlbfs ' | awk '{ print $3 }' } -function get_nvme_name_from_bdf() { - local blknames=() +function get_block_dev_from_bdf() { + local bdf=$1 + local block - set +e - nvme_devs=$(lsblk -d --output NAME | grep "^nvme") - set -e - for dev in $nvme_devs; do - link_name=$(readlink /sys/block/$dev/device/device) || true - if [ -z "$link_name" ]; then - link_name=$(readlink /sys/block/$dev/device) - fi - link_bdf=$(basename "$link_name") - if [ "$link_bdf" = "$1" ]; then - blknames+=($dev) + for block in /sys/block/*; do + if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then + echo "${block##*/}" + return 0 fi done - - printf '%s\n' "${blknames[@]}" } -function get_virtio_names_from_bdf() { - blk_devs=$(lsblk --nodeps --output NAME) - virtio_names=() +function get_mounted_part_dev_from_bdf_block() { + local bdf=$1 + local blocks block part - for dev in $blk_devs; do - if readlink "/sys/block/$dev" | grep -q "$1"; then - virtio_names+=("$dev") - fi + blocks=($(get_block_dev_from_bdf "$bdf")) + + for block in "${blocks[@]}"; do + for part in "/sys/block/$block/$block"*; do + [[ -b /dev/${part##*/} ]] || continue + if [[ $(< /proc/self/mountinfo) == *" $(< "$part/dev") "* ]]; then + echo "${part##*/}" + fi + done done +} + +function collect_devices() { + # NVMe, IOAT, 
IDXD, VIRTIO, VMD + + local ids dev_type dev_id bdf bdfs in_use driver + + ids+="PCI_DEVICE_ID_INTEL_IOAT" + ids+="|PCI_DEVICE_ID_INTEL_IDXD" + ids+="|PCI_DEVICE_ID_VIRTIO" + ids+="|PCI_DEVICE_ID_INTEL_VMD" + ids+="|SPDK_PCI_CLASS_NVME" - eval "$2=( " "${virtio_names[@]}" " )" + local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d + + while read -r _ dev_type dev_id; do + bdfs=(${pci_bus_cache["0x8086:$dev_id"]}) + [[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]}) + [[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]}) + [[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,} + for bdf in "${bdfs[@]}"; do + in_use=0 + if [[ $1 != status ]] && ! pci_can_use "$bdf"; then + pci_dev_echo "$bdf" "Skipping un-whitelisted controller at $bdf" + in_use=1 + fi + if [[ $1 != status ]] && [[ $dev_type == nvme || $dev_type == virtio ]]; then + if ! verify_bdf_mounts "$bdf"; then + in_use=1 + fi + fi + eval "${dev_type}_d[$bdf]=$in_use" + all_devices_d["$bdf"]=$in_use + if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then + driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver") + drivers_d["$bdf"]=${driver##*/} + fi + done + done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h") +} + +function collect_driver() { + local bdf=$1 + local override_driver=$2 + local drivers driver + + [[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1 + if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then + # Pick first entry in case multiple aliases are bound to a driver. 
+ driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) + driver=${driver##*/} + else + driver=$override_driver + fi 2> /dev/null + echo "$driver" +} + +function verify_bdf_mounts() { + local bdf=$1 + local blknames=($(get_mounted_part_dev_from_bdf_block "$bdf")) + + if ((${#blknames[@]} > 0)); then + for name in "${blknames[@]}"; do + pci_dev_echo "$bdf" "Active mountpoints on /dev/$name, so not binding PCI dev" + done + return 1 + fi } function configure_linux_pci() { @@ -219,112 +275,21 @@ function configure_linux_pci() { modprobe $driver_name fi - # NVMe - for bdf in ${pci_bus_cache["0x010802"]}; do - blknames=() - if ! pci_can_use $bdf; then - pci_dev_echo "$bdf" "Skipping un-whitelisted NVMe controller at $bdf" - continue - fi - - mount=false - for blkname in $(get_nvme_name_from_bdf $bdf); do - mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w) - if [ "$mountpoints" != "0" ]; then - mount=true - blknames+=($blkname) + for bdf in "${!all_devices_d[@]}"; do + if ((all_devices_d["$bdf"] == 0)); then + if [[ -n ${nvme_d["$bdf"]} ]]; then + # Some nvme controllers may take significant amount of time while being + # unbound from the driver. Put that task into background to speed up the + # whole process. Currently this is done only for the devices bound to the + # nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being + # unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041. + linux_bind_driver "$bdf" "$driver_name" & + else + linux_bind_driver "$bdf" "$driver_name" fi - done - - if ! $mount; then - linux_bind_driver "$bdf" "$driver_name" - else - for name in "${blknames[@]}"; do - pci_dev_echo "$bdf" "Active mountpoints on /dev/$name, so not binding PCI dev" - done fi done - - # IOAT - TMP=$(mktemp) - #collect all the device_id info of ioat devices. 
- grep "PCI_DEVICE_ID_INTEL_IOAT" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}' > $TMP - - while IFS= read -r dev_id; do - for bdf in ${pci_bus_cache["0x8086:0x$dev_id"]}; do - if ! pci_can_use $bdf; then - pci_dev_echo "$bdf" "Skipping un-whitelisted I/OAT device" - continue - fi - - linux_bind_driver "$bdf" "$driver_name" - done - done < $TMP - rm $TMP - - # IDXD - TMP=$(mktemp) - #collect all the device_id info of idxd devices. - grep "PCI_DEVICE_ID_INTEL_IDXD" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}' > $TMP - - while IFS= read -r dev_id; do - for bdf in ${pci_bus_cache["0x8086:0x$dev_id"]}; do - if ! pci_can_use $bdf; then - pci_dev_echo "$bdf" "Skipping un-whitelisted IDXD device" - continue - fi - - linux_bind_driver "$bdf" "$driver_name" - done - done < $TMP - rm $TMP - - # virtio - TMP=$(mktemp) - #collect all the device_id info of virtio devices. - grep "PCI_DEVICE_ID_VIRTIO" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}' > $TMP - - while IFS= read -r dev_id; do - for bdf in ${pci_bus_cache["0x1af4:0x$dev_id"]}; do - if ! pci_can_use $bdf; then - pci_dev_echo "$bdf" "Skipping un-whitelisted Virtio device at $bdf" - continue - fi - blknames=() - get_virtio_names_from_bdf "$bdf" blknames - for blkname in "${blknames[@]}"; do - if [ "$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)" != "0" ]; then - pci_dev_echo "$bdf" "Active mountpoints on /dev/$blkname, so not binding" - continue 2 - fi - done - - linux_bind_driver "$bdf" "$driver_name" - done - done < $TMP - rm $TMP - - # VMD - TMP=$(mktemp) - #collect all the device_id info of vmd devices. - grep "PCI_DEVICE_ID_INTEL_VMD" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}' > $TMP - - while IFS= read -r dev_id; do - for bdf in ${pci_bus_cache["0x8086:0x$dev_id"]}; do - if [[ -z "$PCI_WHITELIST" ]] || ! 
pci_can_use $bdf; then - echo "Skipping un-whitelisted VMD device at $bdf" - continue - fi - - linux_bind_driver "$bdf" "$driver_name" - echo " VMD generic kdrv: " "$bdf" "$driver_name" - done - done < $TMP - rm $TMP + wait echo "1" > "/sys/bus/pci/rescan" } @@ -447,118 +412,28 @@ function configure_linux() { } function reset_linux_pci() { - # NVMe - set +e - check_for_driver nvme - driver_loaded=$? - set -e - for bdf in ${pci_bus_cache["0x010802"]}; do - if ! pci_can_use $bdf; then - pci_dev_echo "$bdf" "Skipping un-whitelisted NVMe controller $blkname" - continue - fi - if [ $driver_loaded -ne 0 ]; then - linux_bind_driver "$bdf" nvme - else - linux_unbind_driver "$bdf" - fi - done - - # IOAT - TMP=$(mktemp) - #collect all the device_id info of ioat devices. - grep "PCI_DEVICE_ID_INTEL_IOAT" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}' > $TMP - - set +e - check_for_driver ioatdma - driver_loaded=$? - set -e - while IFS= read -r dev_id; do - for bdf in ${pci_bus_cache["0x8086:0x$dev_id"]}; do - if ! pci_can_use $bdf; then - pci_dev_echo "$bdf" "Skipping un-whitelisted I/OAT device" - continue - fi - if [ $driver_loaded -ne 0 ]; then - linux_bind_driver "$bdf" ioatdma - else - linux_unbind_driver "$bdf" - fi - done - done < $TMP - rm $TMP - - # IDXD - TMP=$(mktemp) - #collect all the device_id info of idxd devices. - grep "PCI_DEVICE_ID_INTEL_IDXD" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}' > $TMP - set +e - check_for_driver idxd - driver_loaded=$? - set -e - while IFS= read -r dev_id; do - for bdf in ${pci_bus_cache["0x8086:0x$dev_id"]}; do - if ! pci_can_use $bdf; then - pci_dev_echo "$bdf" "Skipping un-whitelisted IDXD device" - continue - fi - if [ $driver_loaded -ne 0 ]; then - linux_bind_driver "$bdf" idxd - else - linux_unbind_driver "$bdf" - fi - done - done < $TMP - rm $TMP - # virtio - TMP=$(mktemp) - #collect all the device_id info of virtio devices. 
- grep "PCI_DEVICE_ID_VIRTIO" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}' > $TMP - # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded # Requires some more investigation - for example, some kernels do not seem to have # virtio-pci but just virtio_scsi instead. Also need to make sure we get the # underscore vs. dash right in the virtio_scsi name. modprobe virtio-pci || true - while IFS= read -r dev_id; do - for bdf in ${pci_bus_cache["0x1af4:0x$dev_id"]}; do - if ! pci_can_use $bdf; then - pci_dev_echo "$bdf" "Skipping un-whitelisted Virtio device at" - continue - fi - linux_bind_driver "$bdf" virtio-pci - done - done < $TMP - rm $TMP - - # VMD - TMP=$(mktemp) - #collect all the device_id info of vmd devices. - grep "PCI_DEVICE_ID_INTEL_VMD" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}' > $TMP - - set +e - check_for_driver vmd - driver_loaded=$? - set -e - while IFS= read -r dev_id; do - for bdf in ${pci_bus_cache["0x8086:0x$dev_id"]}; do - if ! pci_can_use $bdf; then - echo "Skipping un-whitelisted VMD device at $bdf" - continue - fi - if [ $driver_loaded -ne 0 ]; then - linux_bind_driver "$bdf" vmd - else - linux_unbind_driver "$bdf" - fi - done - done < $TMP - rm $TMP + for bdf in "${!all_devices_d[@]}"; do + ((all_devices_d["$bdf"] == 0)) || continue + + [[ -n ${nvme_d["$bdf"]} ]] && fallback_driver=nvme + [[ -n ${ioat_d["$bdf"]} ]] && fallback_driver=ioatdma + [[ -n ${idxd_d["$bdf"]} ]] && fallback_driver=idxd + [[ -n ${virtio_d["$bdf"]} ]] && fallback_driver=virtio-pci + [[ -n ${vmd_d["$bdf"]} ]] && fallback_driver=vmd + driver=$(collect_driver "$bdf" "$fallback_driver") + + if ! 
check_for_driver "$driver"; then + linux_bind_driver "$bdf" "$driver" + else + linux_unbind_driver "$bdf" + fi + done echo "1" > "/sys/bus/pci/rescan" } @@ -601,12 +476,11 @@ function status_linux() { printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages fi - echo "" + echo -e "\nBDF\t\tVendor\tDevice\tNUMA\tDriver\t\tDevice name\n" echo "NVMe devices" - echo -e "BDF\t\tVendor\tDevice\tNUMA\tDriver\t\tDevice name" - for bdf in ${pci_bus_cache["0x010802"]}; do - driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}') + for bdf in "${!nvme_d[@]}"; do + driver=${drivers_d["$bdf"]} if [ "$numa_nodes" = "0" ]; then node="-" else @@ -615,141 +489,92 @@ function status_linux() { node=unknown fi fi - device=$(cat /sys/bus/pci/devices/$bdf/device) - vendor=$(cat /sys/bus/pci/devices/$bdf/vendor) if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then name="\t"$(ls /sys/bus/pci/devices/$bdf/nvme) else name="-" fi - echo -e "$bdf\t${vendor#0x}\t${device#0x}\t$node\t${driver:--}\t\t$name" + echo -e "$bdf\t${pci_ids_vendor["$bdf"]#0x}\t${pci_ids_device["$bdf"]#0x}\t$node\t${driver:--}\t\t$name" done echo "" echo "I/OAT Engine" - #collect all the device_id info of ioat devices. 
- TMP=$(grep "PCI_DEVICE_ID_INTEL_IOAT" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}') - echo -e "BDF\t\tVendor\tDevice\tNUMA\tDriver" - for dev_id in $TMP; do - for bdf in ${pci_bus_cache["0x8086:0x$dev_id"]}; do - driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}') - if [ "$numa_nodes" = "0" ]; then - node="-" - else - node=$(cat /sys/bus/pci/devices/$bdf/numa_node) - if ((node == -1)); then - node=unknown - fi + for bdf in "${!ioat_d[@]}"; do + driver=${drivers_d["$bdf"]} + if [ "$numa_nodes" = "0" ]; then + node="-" + else + node=$(cat /sys/bus/pci/devices/$bdf/numa_node) + if ((node == -1)); then + node=unknown fi - device=$(cat /sys/bus/pci/devices/$bdf/device) - vendor=$(cat /sys/bus/pci/devices/$bdf/vendor) - echo -e "$bdf\t${vendor#0x}\t${device#0x}\t$node\t${driver:--}" - done + fi + echo -e "$bdf\t${pci_ids_vendor["$bdf"]#0x}\t${pci_ids_device["$bdf"]#0x}\t$node\t${driver:--}" done echo "" echo "IDXD Engine" - #collect all the device_id info of idxd devices. - TMP=$(grep "PCI_DEVICE_ID_INTEL_IDXD" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}') - echo -e "BDF\t\tVendor\tDevice\tNUMA\tDriver" - for dev_id in $TMP; do - for bdf in ${pci_bus_cache["0x8086:0x$dev_id"]}; do - driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}') - if [ "$numa_nodes" = "0" ]; then - node="-" - else - node=$(cat /sys/bus/pci/devices/$bdf/numa_node) - fi - device=$(cat /sys/bus/pci/devices/$bdf/device) - vendor=$(cat /sys/bus/pci/devices/$bdf/vendor) - echo -e "$bdf\t${vendor#0x}\t${device#0x}\t$node\t${driver:--}" - done + for bdf in "${!idxd_d[@]}"; do + driver=${drivers_d["$bdf"]} + if [ "$numa_nodes" = "0" ]; then + node="-" + else + node=$(cat /sys/bus/pci/devices/$bdf/numa_node) + fi + echo -e "$bdf\t${pci_ids_vendor["$bdf"]#0x}\t${pci_ids_device["$bdf"]#0x}\t$node\t${driver:--}" done echo "" echo "virtio" - #collect all the device_id info of virtio devices. 
- TMP=$(grep "PCI_DEVICE_ID_VIRTIO" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}') - echo -e "BDF\t\tVendor\tDevice\tNUMA\tDriver\t\tDevice name" - for dev_id in $TMP; do - for bdf in ${pci_bus_cache["0x1af4:0x$dev_id"]}; do - driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}') - if [ "$numa_nodes" = "0" ]; then - node="-" - else - node=$(cat /sys/bus/pci/devices/$bdf/numa_node) - if ((node == -1)); then - node=unknown - fi + for bdf in "${!virtio_d[@]}"; do + driver=${drivers_d["$bdf"]} + if [ "$numa_nodes" = "0" ]; then + node="-" + else + node=$(cat /sys/bus/pci/devices/$bdf/numa_node) + if ((node == -1)); then + node=unknown fi - device=$(cat /sys/bus/pci/devices/$bdf/device) - vendor=$(cat /sys/bus/pci/devices/$bdf/vendor) - blknames=() - get_virtio_names_from_bdf "$bdf" blknames - echo -e "$bdf\t${vendor#0x}\t${device#0x}\t$node\t\t${driver:--}\t\t" "${blknames[@]}" - done + fi + blknames=($(get_mounted_part_dev_from_bdf_block "$bdf")) + echo -e "$bdf\t${pci_ids_vendor["$bdf"]#0x}\t${pci_ids_device["$bdf"]#0x}\t$node\t\t${driver:--}\t\t" "${blknames[@]}" done echo "" echo "VMD" - #collect all the device_id info of vmd devices. 
- TMP=$(grep "PCI_DEVICE_ID_INTEL_VMD" $rootdir/include/spdk/pci_ids.h \ - | awk -F"x" '{print $2}') - echo -e "BDF\t\tNuma Node\tDriver Name" - for dev_id in $TMP; do - for bdf in ${pci_bus_cache["0x8086:0x$dev_id"]}; do - driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}') - node=$(cat /sys/bus/pci/devices/$bdf/numa_node) - if ((node == -1)); then - node=unknown - fi - echo -e "$bdf\t$node\t\t$driver" - done + for bdf in "${!vmd_d[@]}"; do + driver=${drivers_d["$bdf"]} + node=$(cat /sys/bus/pci/devices/$bdf/numa_node) + if ((node == -1)); then + node=unknown + fi + echo -e "$bdf\t$node\t\t$driver" done } function status_freebsd() { - local id pci - local ioat idxd vmd + local pci status_print() ( local dev driver echo -e "BDF\t\tVendor\tDevice\tDriver" - for id; do - for pci in ${pci_bus_cache["$id"]}; do - driver=$(pciconf -l "pci$pci") - driver=${driver%@*} - printf '%s\t%s\t%s\t%s\n' \ - "$pci" \ - "${pci_ids_vendor["$pci"]}" \ - "${pci_ids_device["$pci"]}" \ - "$driver" - done + for pci; do + driver=$(pciconf -l "pci$pci") + driver=${driver%@*} + printf '%s\t%s\t%s\t%s\n' \ + "$pci" \ + "${pci_ids_vendor["$pci"]}" \ + "${pci_ids_device["$pci"]}" \ + "$driver" done ) - devs=PCI_DEVICE_ID_INTEL_IOAT - devs+="|PCI_DEVICE_ID_INTEL_IDXD" - devs+="|PCI_DEVICE_ID_INTEL_VMD" - - local dev_type dev_id - while read -r _ dev_type dev_id; do - case "$dev_type" in - *IOAT*) ioat+=("0x8086:$dev_id") ;; - *IDXD*) idxd+=("0x8086:$dev_id") ;; - *VMD*) vmd+=("0x8086:$dev_id") ;; - esac - done < <(grep -E "$devs" "$rootdir/include/spdk/pci_ids.h") - local contigmem=present if ! 
kldstat -q -m contigmem; then contigmem="not present" @@ -761,37 +586,26 @@ function status_freebsd() { Num Buffers: $(kenv hw.contigmem.num_buffers) NVMe devices - $(status_print 0x010802) + $(status_print "${!nvme_d[@]}") I/IOAT DMA - $(status_print "${ioat[@]}") + $(status_print "${!ioat_d[@]}") IDXD DMA - $(status_print "${idxd[@]}") + $(status_print "${!idxd_d[@]}") VMD - $(status_print "${vmd[@]}") + $(status_print "${!vmd_d[@]}") BSD_INFO } function configure_freebsd_pci() { - local devs ids id local BDFS - devs=PCI_DEVICE_ID_INTEL_IOAT - devs+="|PCI_DEVICE_ID_INTEL_IDXD" - devs+="|PCI_DEVICE_ID_INTEL_VMD" - - ids=($(grep -E "$devs" "$rootdir/include/spdk/pci_ids.h" | awk '{print $3}')) - - if [[ -n ${pci_bus_cache["0x010802"]} ]]; then - BDFS+=(${pci_bus_cache["0x010802"]}) - fi - - for id in "${ids[@]}"; do - [[ -n ${pci_bus_cache["0x8086:$id"]} ]] || continue - BDFS+=(${pci_bus_cache["0x8086:$id"]}) - done + BDFS+=("${!nvme_d[@]}") + BDFS+=("${!ioat_d[@]}") + BDFS+=("${!idxd_d[@]}") + BDFS+=("${!vmd_d[@]}") # Drop the domain part from all the addresses BDFS=("${BDFS[@]#*:}") @@ -850,6 +664,33 @@ if [ -z "$TARGET_USER" ]; then fi fi +collect_devices "$mode" + +if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then + # Note that this will wait only for the first block device attached to + # a given storage controller. For nvme this may miss some of the devs + # in case multiple namespaces are being in place. + # FIXME: Wait for nvme controller(s) to be in live state and determine + # number of configured namespaces, build list of potential block devs + # and pass them to sync_dev_uevents. Is it worth the effort? 
+ bdfs_to_wait_for=() + for bdf in "${!all_devices_d[@]}"; do + ((all_devices_d["$bdf"] == 0)) || continue + if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then + [[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue + bdfs_to_wait_for+=("$bdf") + fi + done + if ((${#bdfs_to_wait_for[@]} > 0)); then + echo "Waiting for block devices as requested" + export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci + "$rootdir/scripts/sync_dev_uevents.sh" \ + block/disk \ + "${bdfs_to_wait_for[@]}" & + sync_pid=$! + fi +fi + if [[ $os == Linux ]]; then HUGEPGSZ=$(($(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9'))) HUGEPGSZ_MB=$((HUGEPGSZ / 1024)) @@ -883,3 +724,7 @@ else usage $0 "Invalid argument '$mode'" fi fi + +if [[ -e /proc/$sync_pid/status ]]; then + wait "$sync_pid" +fi diff --git a/scripts/spdkcli.py b/scripts/spdkcli.py index bc85649e0d9..3d7c63baa8e 100755 --- a/scripts/spdkcli.py +++ b/scripts/spdkcli.py @@ -71,12 +71,13 @@ def main(): spdk_shell.con.display("SPDK CLI v0.1") spdk_shell.con.display("") - try: - spdk_shell.run_interactive() - except (JSONRPCException, ExecutionError) as e: - spdk_shell.log.error("%s" % e) - except BrokenPipeError as e: - spdk_shell.log.error("Lost connection with SPDK: %s" % e) + while not spdk_shell._exit: + try: + spdk_shell.run_interactive() + except (JSONRPCException, ExecutionError) as e: + spdk_shell.log.error("%s" % e) + except BrokenPipeError as e: + spdk_shell.log.error("Lost connection with SPDK: %s" % e) if __name__ == "__main__": diff --git a/scripts/spdkcli/ui_node_iscsi.py b/scripts/spdkcli/ui_node_iscsi.py index 938cb7ab495..6852670b3d7 100644 --- a/scripts/spdkcli/ui_node_iscsi.py +++ b/scripts/spdkcli/ui_node_iscsi.py @@ -315,7 +315,7 @@ def ui_command_create(self, tag, portal_list): if cpumask: print("WARNING: Specifying a CPU mask for portal groups is no longer supported. 
Ignoring.") tag = self.ui_eval_param(tag, "number", None) - self.get_root().construct_portal_group(tag=tag, portals=portals) + self.get_root().construct_portal_group(tag=tag, portals=portals, private=None) def ui_command_delete(self, tag): """Delete a portal group with given tag (unique, integer > 0))""" diff --git a/scripts/sync_dev_uevents.sh b/scripts/sync_dev_uevents.sh new file mode 100755 index 00000000000..b7b2f76988d --- /dev/null +++ b/scripts/sync_dev_uevents.sh @@ -0,0 +1,169 @@ +#!/usr/bin/env bash +shopt -s extglob + +exec {err}>&2 + +help() { + cat <<- HELP + ${0##*/}: subsystem dev [..devN] + + Env: + UEVENT_TIMEOUT - how long to wait for sync - ${UEVENT_TIMEOUT:-10}s + UEVENT_ACTION - uevent action to match on - ${UEVENT_ACTION:-add} + DEVPATH_LOOKUP - check if given dev matches inside DEVPATH + DEVPATH_SUBSYSTEM - subsystem given dev should match in DEVPATH + HELP +} + +get_uevent_attr() ( + source "$1" + + [[ -v $2 ]] && echo "${!2}" +) + +filter_devs() { + local dev p_dev + local maj min type sub + + for dev in "${!devs[@]}"; do + [[ -e /dev/${devs[dev]} ]] || continue + [[ -c /dev/${devs[dev]} ]] && type=char + [[ -b /dev/${devs[dev]} ]] && type=block + maj=$((0x$(stat --printf="%t" "/dev/${devs[dev]}"))) + min=$((0x$(stat --printf="%T" "/dev/${devs[dev]}"))) + + p_dev=/sys/dev/$type/$maj:$min + if [[ -e $p_dev ]]; then + printf '/dev/%s\n' "${devs[dev]}" + + type=$(get_uevent_attr "$p_dev/uevent" DEVTYPE) + sub=$(readlink -f "$p_dev/subsystem") sub=${sub##*/} + if [[ $sub != "${subsystem%%/*}" ]]; then + printf ' wrong subsystem specified (%s != %s)\n' \ + "${subsystem%%/*}" "$sub" + fi >&2 + + if [[ ${subsystem##*/} != "$subsystem" && -n $type ]]; then + if [[ ${subsystem##*/} != "$type" ]]; then + printf ' wrong devtype specified (%s != %s)\n' \ + "${subsystem##*/}" "$type" + fi + fi >&2 + + unset -v "devs[dev]" + fi + done +} + +look_in_devpath() { + local find=$1 + local path=$2 + local sub + + [[ -v DEVPATH_LOOKUP ]] || return 1 + + if [[ 
-z $path ]]; then + return 1 + fi + + if [[ -e $path/subsystem ]]; then + sub=$(readlink -f "$path/subsystem") + sub=${sub##*/} + fi + + if [[ ${path##*/} == "$find" ]]; then + if [[ -n $DEVPATH_SUBSYSTEM ]]; then + [[ $DEVPATH_SUBSYSTEM == "$sub" ]] || return 1 + fi + return 0 + fi + look_in_devpath "$find" "${path%/*}" +} + +if (($# < 2)); then + help + exit 1 +fi + +subsystem=$1 devs=("${@:2}") +timeout=${UEVENT_TIMEOUT:-10} +action=${UEVENT_ACTION:-add} + +devs=("${devs[@]#/dev/}") +[[ $action == add ]] && filter_devs + +((${#devs[@]})) || exit 0 + +if [[ -S /run/udev/control ]]; then + # systemd-udevd realm + + # If devtmpfs is in place then all, e.g., block subsystem devices are going to + # be handled directly by the kernel. Otherwise, link to udev events in case we + # have some old udevd on board which is meant to mknod them instead. + if [[ $(< /proc/mounts) == *"/dev devtmpfs"* ]]; then + events+=(--kernel) + else + events+=(--udev) + fi + + if [[ $subsystem != all ]]; then + events+=("--subsystem-match=$subsystem") + fi + + # This trap targets a subshell which forks udevadm monitor. Since + # process substitution works in an async fashion, $$ won't wait + # for it, leaving it's child unattended after the main loop breaks + # (udevadm won't exit on its own either). + trap '[[ -e /proc/$!/status ]] && pkill -P $!' EXIT + # Also, this will block while reading through a pipe with a timeout + # after not receiving any input. stdbuf is used since udevadm always + # line buffers the monitor output. + while ((${#devs[@]} > 0)) && IFS="=" read -t"$timeout" -r k v; do + if [[ $k == ACTION && $v == "$action" ]]; then + look_for_devname=1 + continue + fi + if ((look_for_devname == 1)); then + for dev in "${!devs[@]}"; do + # Explicitly allow globbing of the rhs to allow more open matching. 
+ # shellcheck disable=SC2053 + if [[ ${v#/dev/} == ${devs[dev]} || ${v##*/} == ${devs[dev]##*/} ]] \ + || look_in_devpath "${devs[dev]}" "/sys/$v"; then + unset -v "devs[dev]" + look_for_devname=0 + fi + done + fi + done < <(stdbuf --output=0 udevadm monitor --property "${events[@]}") + if ((${#devs[@]} > 0)); then + printf '* Events for some %s devices (%s) were not caught, they may be missing\n' \ + "$subsystem" "${devs[*]}" + fi >&"$err" + exit 0 +elif [[ -e /sys/kernel/uevent_helper ]]; then + # Check if someones uses mdev to serialize uevents. If yes, simply check + # if they are in sync, no need to lookup specific devices in this case. + # If not, fall through to plain sleep. + # To quote some wisdom from gentoo: + # "Even the craziest scenario deserves a fair chance". + + helper=$(< /sys/kernel/uevent_helper) + if [[ ${helper##*/} == mdev && -e /dev/mdev.seq ]]; then + # mdev keeps count of the seqnums on its own on each execution + # and saves the count under /dev/mdev.seq. This is then set to + # + 1 after the uevents finally settled. 
+ while ((timeout--)); do + # Stop as soon as mdev caught up; timeout only drops below 0 when we never synced. + (($(< /sys/kernel/uevent_seqnum) + 1 == $(< /dev/mdev.seq))) && break + sleep 1s + done + if ((timeout < 0)); then + printf '* Events not synced in time, %s devices (%s) may be missing\n' \ + "$subsystem" "${devs[*]}" + fi + exit 0 + fi >&"$err" +fi 2> /dev/null + +# Fallback, sleep and hope for the best +sleep "${timeout}s" diff --git a/scripts/vagrant/README.md b/scripts/vagrant/README.md index 323a26d2d0b..ff77bfe4781 100644 --- a/scripts/vagrant/README.md +++ b/scripts/vagrant/README.md @@ -184,7 +184,8 @@ Following VM initialization you must: ``` $ sudo scripts/setup.sh - $ sudo ./build/examples/hello_bdev + $ sudo scripts/gen_nvme.sh --json-with-subsystems > ./build/examples/hello_bdev.json + $ sudo ./build/examples/hello_bdev --json ./build/examples/hello_bdev.json -b Nvme0n1 ``` ### Running autorun.sh with vagrant diff --git a/scripts/vagrant/Vagrantfile b/scripts/vagrant/Vagrantfile index 92d0f2985b2..f868e9d015c 100644 --- a/scripts/vagrant/Vagrantfile +++ b/scripts/vagrant/Vagrantfile @@ -2,146 +2,216 @@ # vi: set ft=ruby : require 'open3' -def checkboxtype(distro) +def get_box_type(distro) + spdk_distro = 'spdk/' + distro localboxes, stderr, status = Open3.capture3("vagrant box list") - if localboxes.include? 
"spdk/"+distro - return "spdk/"+distro - else - case distro - when "centos7" - return "centos/7" - when "centos8" - return "centos/8" - when "ubuntu1604" - return "peru/ubuntu-16.04-server-amd64" - when "ubuntu1804" - return "peru/ubuntu-18.04-server-amd64" - when "fedora30" - return "generic/fedora30" - when "fedora31" - return "generic/fedora31" - when "arch" - return "generic/arch" - when "freebsd11" - return "generic/freebsd11" - when "freebsd12" - return "generic/freebsd12" - when "clearlinux" - return "AntonioMeireles/ClearLinux" - else - "Invalid argument #{distro}" - abort("Invalid argument!") - end - end -end + return spdk_distro if localboxes.include?(spdk_distro) -Vagrant.configure(2) do |config| + distro_to_type = { + 'centos7' => 'centos/7', + 'centos8' => 'centos/8', + 'ubuntu1604' => 'peru/ubuntu-16.04-server-amd64', + 'ubuntu1804' => 'peru/ubuntu-18.04-server-amd64', + 'fedora30' => 'generic/fedora30', + 'fedora31' => 'generic/fedora31', + 'fedora32' => 'generic/fedora32', + 'arch' => 'generic/arch', + 'freebsd11' => 'generic/freebsd11', + 'freebsd12' => 'generic/freebsd12', + 'clearlinux' => 'AntonioMeireles/ClearLinux' + } + abort("Invalid argument! #{distro}") unless distro_to_type.key?(distro) - # Pick the right distro and bootstrap, default is fedora30 - distro = ( ENV['SPDK_VAGRANT_DISTRO'] || "fedora30") - provider = (ENV['SPDK_VAGRANT_PROVIDER'] || "virtualbox") + return distro_to_type[distro] +end - # generic/freebsd boxes do not work properly with vagrant-proxyconf and - # have issues installing rsync and sshfs for syncing files. NFS is - # pre-installed, so use it. - # generic/fedora boxes on the other hand have problems running NFS - # service so use sshfs+rsync combo instead. - plugins_sync_backend = {type: :sshfs} - files_sync_backend = {type: "rsync", rsync__auto: false} - if distro.include? 
"freebsd" - plugins_sync_backend = {type: :nfs, nfs_udp: false} - files_sync_backend = {type: :nfs, nfs_udp: false, mount_options: ['ro']} - end - config.vm.box = checkboxtype(distro) - config.vm.box_check_update = false - config.vm.synced_folder '.', '/vagrant', disabled: true +def setup_proxy(config,distro) + return unless ENV['http_proxy'] - # Copy in the .gitconfig if it exists - if File.file?(File.expand_path("~/.gitconfig")) - config.vm.provision "file", source: "~/.gitconfig", destination: ".gitconfig" + if Vagrant.has_plugin?("vagrant-proxyconf") + config.proxy.http = ENV['http_proxy'] + config.proxy.https = ENV['https_proxy'] + config.proxy.no_proxy = "localhost,127.0.0.1" end - # Copy the tsocks configuration file for use when installing some spdk test pool dependencies - if File.file?("/etc/tsocks.conf") - $tsocks_copy = <<-SCRIPT + # Proxyconf does not seem to support FreeBSD boxes or at least it's + # docs do not mention that. Set up proxy configuration manually. + if distro.include?("freebsd") + $freebsd_proxy = <<-SCRIPT sudo -s - mv -f tsocks.conf /etc/tsocks.conf - chown root /etc/tsocks.conf - chmod 644 /etc/tsocks.conf + echo "export http_proxy=#{ENV['http_proxy']}" >> /etc/profile + echo "export https_proxy=#{ENV['http_proxy']}" >> /etc/profile + echo "pkg_env: {http_proxy: #{ENV['http_proxy']}}" > /usr/local/etc/pkg.conf + chown root:wheel /usr/local/etc/pkg.conf + chmod 644 /usr/local/etc/pkg.conf SCRIPT - config.vm.provision "file", source: "/etc/tsocks.conf", destination: "tsocks.conf" - config.vm.provision "shell", inline: $tsocks_copy + config.vm.provision "shell", inline: $freebsd_proxy end +end - # vagrant-cachier caches apt/yum etc to speed subsequent - # vagrant up - # to enable, run - # vagrant plugin install vagrant-cachier - # - if Vagrant.has_plugin?("vagrant-cachier") - config.cache.scope = :box - config.cache.synced_folder_opts = plugins_sync_backend +def copy_gitconfig(config) + src_path = '~/.gitconfig' + return unless 
File.file?(File.expand_path(src_path)) + + config.vm.provision "file", source: src_path, destination: ".gitconfig" +end + +def copy_tsocks(config) + tsocks_file = 'tsocks.conf' + tsocks_file_path = '/etc/' + tsocks_file + + return unless File.file?(tsocks_file_path) + + $tsocks_copy_cmd = <<-SCRIPT + sudo -s + mv -f "#{tsocks_file}" "#{tsocks_file_path}" + chown root "#{tsocks_file_path}" + chmod 644 "#{tsocks_file_path}" + SCRIPT + + config.vm.provision "file", source: tsocks_file_path, destination: tsocks_file + config.vm.provision "shell", inline: $tsocks_copy_cmd +end + +def copy_vagrant_tools(config,files_sync_backend) + src_path = '~/vagrant_tools' + return unless File.directory?(File.expand_path(src_path)) + + config.vm.synced_folder src_path, "/home/vagrant/tools", files_sync_backend +end + +def copy_spdk_dir(config, files_sync_backend) + return unless ENV['COPY_SPDK_DIR'] == "1" + return unless ENV['SPDK_DIR'] + + config.vm.synced_folder ENV['SPDK_DIR'], '/home/vagrant/spdk_repo/spdk', files_sync_backend +end + +def copy_spdk_artifacts(config, plugins_sync_backend) + return unless ENV['COPY_SPDK_ARTIFACTS'] == "1" + + vagrantfile_dir=(ENV['VAGRANTFILE_DIR'] || "none") + config.vm.synced_folder "#{vagrantfile_dir}/output", "/home/vagrant/spdk_repo/output", plugins_sync_backend +end + +def make_spdk_local_copy_of_nfs(config,distro) + user_group = 'vagrant:vagrant' + if distro.include? 
'clearlinux' + user_group = 'clear:clear' end - # use http proxy if avaiable - if ENV['http_proxy'] - if Vagrant.has_plugin?("vagrant-proxyconf") - config.proxy.http = ENV['http_proxy'] - config.proxy.https = ENV['https_proxy'] - config.proxy.no_proxy = "localhost,127.0.0.1" - end + spdk_path = '/home/vagrant/spdk_repo/spdk' + spdk_tmp_path = '/tmp/spdk' + $spdk_repo_cmd = <<-SCRIPT + sudo -s + cp -R '#{spdk_path}' '#{spdk_tmp_path}' + umount '#{spdk_path}' && rm -rf '#{spdk_path}' + mv '#{spdk_tmp_path}' '#{spdk_path}' + chown -R #{user_group} '#{spdk_path}' + SCRIPT - # Proxyconf does not seem to support FreeBSD boxes or at least it's - # docs do not mention that. Set up proxy configuration manually. - if distro.include?("freebsd") - $freebsd_proxy = <<-SCRIPT - sudo -s - echo "export http_proxy=#{ENV['http_proxy']}" >> /etc/profile - echo "export https_proxy=#{ENV['http_proxy']}" >> /etc/profile - echo "pkg_env: {http_proxy: #{ENV['http_proxy']}}" > /usr/local/etc/pkg.conf - chown root:wheel /usr/local/etc/pkg.conf - chmod 644 /usr/local/etc/pkg.conf - SCRIPT - config.vm.provision "shell", inline: $freebsd_proxy - end + config.vm.provision "shell", inline: $spdk_repo_cmd +end + +def clear_cflags(config) + $clearcflags_cmd = <<-SCRIPT + echo "export CFLAGS=" >> /etc/profile.d/clearcflags.sh + echo "export CFFLAGS=" >> /etc/profile.d/clearcflags.sh + echo "export CXXFLAGS=" >> /etc/profile.d/clearcflags.sh + echo "export FFLAGS=" >> /etc/profile.d/clearcflags.sh + echo "export THEANO_FLAGS=" >> /etc/profile.d/clearcflags.sh + SCRIPT + config.vm.provision "shell", inline: $clearcflags_cmd, run: "always" +end + +def get_nvme_disk(disk, index) + if ENV['NVME_FILE'] + nvme_file = ENV['NVME_FILE'].split(',') + nvme_disk = nvme_file[index] + else + nvme_disk = '/var/lib/libvirt/images/nvme_disk.img' end - # freebsd boxes in order to have spdk sources synced from host properly - # will use NFS with "ro" option enabled to prevent changes on host filesystem. 
- # To make sources usable in the guest VM we need to unmount them and use - # local copy. - if distro.include?("freebsd") - $freebsd_spdk_repo = <<-SCRIPT - sudo -s - cp -R /home/vagrant/spdk_repo/spdk /tmp/spdk - umount /home/vagrant/spdk_repo/spdk && rm -rf /home/vagrant/spdk_repo/spdk - mv /tmp/spdk /home/vagrant/spdk_repo/spdk - chown -R vagrant:vagrant /home/vagrant/spdk_repo/spdk - SCRIPT - config.vm.provision "shell", inline: $freebsd_spdk_repo + unless File.exist? (nvme_disk) + puts 'If run with libvirt provider please execute create_nvme_img.sh' end - vmcpu=(ENV['SPDK_VAGRANT_VMCPU'] || 2) - vmram=(ENV['SPDK_VAGRANT_VMRAM'] || 4096) - spdk_dir=(ENV['SPDK_DIR'] || "none") - vmemulator=(ENV['SPDK_QEMU_EMULATOR'] || "") - emulated_nvme_types=(ENV['NVME_DISKS_TYPE'] || "nvme").split(',') + return nvme_disk +end + +def setup_nvme_disk(libvirt, disk, index) + nvme_disk_id = disk + '-' + index.to_s + nvme_disk = get_nvme_disk(disk, index) + nvme_namespaces=(ENV['NVME_DISKS_NAMESPACES'] || "").split(',') - nvme_file=(ENV['NVME_FILE'] || "").split(',') nvme_cmbs=(ENV['NVME_CMB'] || "").split(',') - vagrantfile_dir=(ENV['VAGRANTFILE_DIR'] || "none") + libvirt.qemuargs :value => "-drive" + libvirt.qemuargs :value => "format=raw,file=#{nvme_disk},if=none,id=#{nvme_disk_id}" + libvirt.qemuargs :value => "-device" + nvme_drive = "nvme,drive=#{nvme_disk_id},serial=1234#{index}" + if !nvme_namespaces[index].nil? && nvme_namespaces[index] != 1 + nvme_drive << ",namespaces=#{nvme_namespaces[index]}" + end + + if !nvme_cmbs[index].nil? 
&& nvme_cmbs[index] == "true" + # Fix the size of the buffer to 128M + nvme_drive << ",cmb_size_mb=128" + end + libvirt.qemuargs :value => nvme_drive +end + +def setup_ocssd_disk(libvirt, disk, index) + nvme_disk_id = disk + '-' + index.to_s + nvme_disk = get_nvme_disk(disk, index) + + libvirt.qemuargs :value => "-drive" + libvirt.qemuargs :value => "format=raw,file=#{nvme_disk},if=none,id=#{nvme_disk_id}" + libvirt.qemuargs :value => "-device" + # create ocssd drive with special parameters + # lba_index=4 it is LBA namespace format, 4 means that block size is 4K and have 64B metadata + # lnum_lun, lnum_pln, lpgs_per_blk, lsecs_per_pg, lblks_per_pln this are parameters describing the device geometry + # we need to multiply these parameters by ourselves to have backend file minimal size: + # in our case: 4K * 8 * 2 * 1536 * 2 * 45 = 8640 MB + libvirt.qemuargs :value => "nvme,drive=#{nvme_disk_id},serial=deadbeef,oacs=0,namespaces=1,lver=2,lba_index=4,mdts=10,lnum_lun=8,lnum_pln=2,lpgs_per_blk=1536,lsecs_per_pg=2,lblks_per_pln=45,metadata=#{nvme_disk}_ocssd_md,nsdatafile=#{nvme_disk}_ocssd_blknvme.ns,laer_thread_sleep=3000,stride=4" +end + +def setup_ssh(config) config.ssh.forward_agent = true config.ssh.forward_x11 = true if ENV['VAGRANT_PASSWORD_AUTH'] == "1" config.ssh.username = "vagrant" config.ssh.password = "vagrant" end +end + +def setup_vagrant_cachier(config, plugins_sync_backend) + if Vagrant.has_plugin?("vagrant-cachier") + config.cache.scope = :box + config.cache.synced_folder_opts = plugins_sync_backend + else + puts 'vagrant-cachier caches apt/yum etc to speed subsequent vagrant up' + puts 'to enable install vagrant-cachier plugin: ' + puts 'vagrant plugin install vagrant-cachier' + end +end + +def deploy_test_vm(config) + return unless ENV['DEPLOY_TEST_VM'] == "1" + return unless ENV['SPDK_DIR'] + + config.vm.provision "shell" do |setup| + setup.path = ENV['SPDK_DIR'] + '/test/common/config/vm_setup.sh' + setup.privileged = false + setup.args = ["-u", 
"-i"] + end +end +def setup_virtualbox(config, vmcpu, vmram) config.vm.provider "virtualbox" do |vb| vb.customize ["modifyvm", :id, "--ioapic", "on"] - vb.memory = "#{vmram}" - vb.cpus = "#{vmcpu}" + vb.memory = vmram + vb.cpus = vmcpu nvme_disk=(ENV['NVME_FILE'] || "nvme_disk.img") unless File.exist? (nvme_disk) @@ -154,22 +224,33 @@ Vagrant.configure(2) do |config| vb.customize ["setextradata", :id, "VBoxInternal/CPUM/SSE4.1", "1"] vb.customize ["setextradata", :id, "VBoxInternal/CPUM/SSE4.2", "1"] end +end + +def setup_libvirt(config, vmcpu, vmram, distro) + emulated_nvme_types=(ENV['NVME_DISKS_TYPE'] || "nvme").split(',') - # This setup was Tested on Fedora 27 - # libvirt configuration need modern Qemu(tested on 2.10) & vagrant-libvirt in version 0.0.39+ - # There are few limitation for SElinux - The file added outside libvirt must have proper SE ACL policy or setenforce 0 config.vm.provider "libvirt" do |libvirt, override| libvirt.random_hostname = "1" - libvirt.disk_bus = "virtio" + libvirt.driver = "kvm" + libvirt.graphics_type = "vnc" + libvirt.memory = vmram + libvirt.cpus = vmcpu + libvirt.video_type = "cirrus" - # generic/freebsd boxes need to be explicitly run with SCSI bus, - # otherwise boot process fails on mounting the disk if (distro.include?("freebsd")) + # generic/freebsd boxes need to be explicitly run with SCSI bus, + # otherwise boot process fails on mounting the disk libvirt.disk_bus = "scsi" + elsif (distro.include?("arch")) + # Run generic/arch boxes explicitly with IDE bus, + # otherwise boot process fails on mounting the disk + libvirt.disk_bus = "ide" + else + libvirt.disk_bus = "virtio" end - if not vmemulator.empty? 
- libvirt.emulator_path = "#{vmemulator}" + if ENV['SPDK_QEMU_EMULATOR'] + libvirt.emulator_path = ENV['SPDK_QEMU_EMULATOR'] libvirt.machine_type = "pc" end @@ -178,49 +259,13 @@ Vagrant.configure(2) do |config| # Loop to create all emulated disks set emulated_nvme_types.each_with_index { |disk, index| - if ENV['NVME_FILE'] - nvme_disk_id="#{disk}" + "-#{index}" - nvme_disk="#{nvme_file["#{index}".to_i]}" - else - nvme_disk="/var/lib/libvirt/images/nvme_disk.img" - end - - unless File.exist? (nvme_disk) - puts "If run with libvirt provider please execute create_nvme_img.sh" - end - if disk == "nvme" - libvirt.qemuargs :value => "-drive" - libvirt.qemuargs :value => "format=raw,file=#{nvme_disk},if=none,id=#{nvme_disk_id}" - libvirt.qemuargs :value => "-device" - nvme_drive = "nvme,drive#{nvme_disk_id},serial=1234#{index}" - if !nvme_namespaces["#{index}".to_i].nil? && nvme_namespaces["#{index}".to_i] != 1 - nvme_drive << ",namespaces=#{nvme_namespaces["#{index}".to_i]}" - end - if !nvme_cmbs["#{index}".to_i].nil? 
&& nvme_cmbs["#{index}".to_i] == "true" - # Fix the size of the buffer to 128M - nvme_drive << ",cmb_size_mb=128" - end - libvirt.qemuargs :value => nvme_drive + setup_nvme_disk(libvirt, disk, index) elsif disk == "ocssd" - libvirt.qemuargs :value => "-drive" - libvirt.qemuargs :value => "format=raw,file=#{nvme_disk},if=none,id=#{nvme_disk_id}" - libvirt.qemuargs :value => "-device" - # create ocssd drive with special parameters - # lba_index=4 it is LBA namespace format, 4 means that block size is 4K and have 64B metadata - # lnum_lun, lnum_pln, lpgs_per_blk, lsecs_per_pg, lblks_per_pln this are parameters describing the device geometry - # we need to multiply these parameters by ourselves to have backend file minimal size: - # in our case: 4K * 8 * 2 * 1536 * 2 * 45 = 8640 MB - libvirt.qemuargs :value => "nvme,drive=#{nvme_disk_id},serial=deadbeef,oacs=0,namespaces=1,lver=2,lba_index=4,mdts=10,lnum_lun=8,lnum_pln=2,lpgs_per_blk=1536,lsecs_per_pg=2,lblks_per_pln=45,metadata=#{nvme_disk}_ocssd_md,nsdatafile=#{nvme_disk}_ocssd_blknvme.ns,laer_thread_sleep=3000,stride=4" + setup_ocssd_disk(libvirt, disk, index) end } - libvirt.driver = "kvm" - libvirt.graphics_type = "vnc" - libvirt.memory = "#{vmram}" - libvirt.cpus = "#{vmcpu}" - libvirt.video_type = "cirrus" - if ENV['VAGRANT_HUGE_MEM'] == "1" libvirt.memorybacking :hugepages end @@ -228,41 +273,89 @@ Vagrant.configure(2) do |config| # Optional field if we want use other storage pools than default # libvirt.storage_pool_name = "vm" end +end + +################################################################################################# +# Pick the right distro and bootstrap, default is fedora30 +distro = (ENV['SPDK_VAGRANT_DISTRO'] || "fedora30") +provider = (ENV['SPDK_VAGRANT_PROVIDER'] || "virtualbox") + +# Get all variables for creating vm +vmcpu = (ENV['SPDK_VAGRANT_VMCPU'] || 2) +vmram = (ENV['SPDK_VAGRANT_VMRAM'] || 4096) +nfs_sync_backend_distros = ['freebsd', 'clearlinux'] +openstack_network = 
(ENV['SPDK_OPENSTACK_NETWORK'] || false) + +# generic/freebsd boxes do not work properly with vagrant-proxyconf and +# have issues installing rsync and sshfs for syncing files. NFS is +# pre-installed, so use it. +# generic/fedora boxes on the other hand have problems running NFS +# service so use sshfs+rsync combo instead. +if (nfs_sync_backend_distros.any? { |d| distro.include?(d) }) + files_sync_backend = {type: :nfs, nfs_udp: false, mount_options: ['ro']} + plugins_sync_backend = {type: :nfs, nfs_udp: false} +else + # Remove --copy-links from default rsync cmdline since we do want to sync + # actual symlinks as well. Also, since copy is made between host and its + # local VM we don't need to worry about saturating the local link so skip + # the compression to speed up the whole transfer. + files_sync_backend = {type: "rsync", rsync__auto: false, rsync__args: ["--archive", "--verbose", "--delete"]} + plugins_sync_backend = {type: :sshfs} +end + +Vagrant.configure(2) do |config| + config.vm.box = get_box_type(distro) + config.vm.box_check_update = false + config.vm.synced_folder '.', '/vagrant', disabled: true + + # Add network interface for openstack tests + if openstack_network + config.vm.network "private_network", ip: "10.0.2.15" + end + # Copy in the .gitconfig if it exists + copy_gitconfig(config) + + # Copy the tsocks configuration file for use when installing some spdk test pool dependencies + copy_tsocks(config) + + # Copy in the user's tools if they exists + copy_vagrant_tools(config,files_sync_backend) # rsync the spdk directory if provision hasn't happened yet # Warning: rsync does not work with freebsd boxes, so this step is disabled - if ENV['COPY_SPDK_DIR'] == "1" && spdk_dir != "none" - config.vm.synced_folder "#{spdk_dir}", "/home/vagrant/spdk_repo/spdk", files_sync_backend - end + copy_spdk_dir(config, files_sync_backend) # rsync artifacts from build - if ENV['COPY_SPDK_ARTIFACTS'] == "1" - config.vm.synced_folder "#{vagrantfile_dir}/output", 
"/home/vagrant/spdk_repo/output", plugins_sync_backend - end + copy_spdk_artifacts(config, plugins_sync_backend) - # provision the vm with all of the necessary spdk dependencies for running the autorun.sh tests - if ENV['DEPLOY_TEST_VM'] == "1" && spdk_dir != "none" - config.vm.provision "shell" do |setup| - setup.path = "#{spdk_dir}/test/common/config/vm_setup.sh" - setup.privileged = false - setup.args = ["-u", "-i"] - end - end + # vagrant-cachier caches apt/yum etc to speed subsequent + # vagrant up + setup_vagrant_cachier(config, plugins_sync_backend) + + # use http proxy if avaiable + setup_proxy(config, distro) # Clear CFLAGS in clear linux - if distro == "clearlinux" - $clearcflags = <<-SCRIPT - echo "export CFLAGS=" >> /etc/profile.d/clearcflags.sh - echo "export CFFLAGS=" >> /etc/profile.d/clearcflags.sh - echo "export CXXFLAGS=" >> /etc/profile.d/clearcflags.sh - echo "export FFLAGS=" >> /etc/profile.d/clearcflags.sh - echo "export THEANO_FLAGS=" >> /etc/profile.d/clearcflags.sh - SCRIPT - config.vm.provision "shell", inline: $clearcflags, run: "always" - end + clear_cflags(config) if distro == 'clearlinux' - # Copy in the user's tools if they exists - if File.directory?(File.expand_path("~/vagrant_tools")) - config.vm.synced_folder "~/vagrant_tools", "/home/vagrant/tools", files_sync_backend - end + # freebsd and clearlinux boxes in order to have spdk sources synced from + # host properly will use NFS with "ro" option enabled to prevent changes + # on host filesystem. + # To make sources usable in the guest VM we need to unmount them and use + # local copy. 
+ make_spdk_local_copy_of_nfs(config,distro) if plugins_sync_backend[:type] == :nfs + + # Setup SSH + setup_ssh(config) + + # Virtualbox configuration + setup_virtualbox(config,vmcpu,vmram) + + # This setup was Tested on Fedora 27 + # libvirt configuration need modern Qemu(tested on 2.10) & vagrant-libvirt in version 0.0.39+ + # There are few limitation for SElinux - The file added outside libvirt must have proper SE ACL policy or setenforce 0 + setup_libvirt(config,vmcpu,vmram,distro) + + # provision the vm with all of the necessary spdk dependencies for running the autorun.sh tests + deploy_test_vm(config) end diff --git a/scripts/vagrant/Vagrantfile_vhost_vm b/scripts/vagrant/Vagrantfile_vhost_vm index b12e8db3423..4daeb105f9a 100644 --- a/scripts/vagrant/Vagrantfile_vhost_vm +++ b/scripts/vagrant/Vagrantfile_vhost_vm @@ -14,6 +14,10 @@ Vagrant.configure(2) do |config| # See: https://app.vagrantup.com/bento/boxes/ubuntu-18.04 config.vm.box = "bento/ubuntu-18.04" config.vm.box_version = "201808.24.0" + when "fedora31" + # See: https://app.vagrantup.com/generic/boxes/fedora31 + config.vm.box = "generic/fedora31" + config.vm.box_version = "2.0.6" else "Invalid argument #{distro}" abort("Invalid argument!") @@ -63,7 +67,20 @@ Vagrant.configure(2) do |config| sudo DEBIAN_FRONTEND=noninteractive apt -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" upgrade sudo apt -y install -y fio sg3-utils bc SCRIPT - config.vm.provision "shell", inline: $apt_script + + $dnf_script = <<-SCRIPT + sudo dnf -y update + sudo dnf -y install fio sg3_utils bc + SCRIPT + + $install_script = case distro + when "ubuntu16" then $apt_script + when "ubuntu18" then $apt_script + when "fedora31" then $dnf_script + else abort("#{distro} distribution is not supported yet") + end + + config.vm.provision "shell", inline: $install_script # Modify GRUB options # console=ttyS0 earlyprintk=ttyS0 - reroute output to serial dev, so that QEMU can write output to file @@ -72,13 
+89,23 @@ Vagrant.configure(2) do |config| # Reason for these options is that NIC can have different udev name during provisioning with Vagrant # and then some other name while running SPDK tests which use Qemu without any hypervisor like vbox or libvirt # so no corresponding configuration for this NIC name will be present in /etc. - config.vm.provision "shell", inline: 'sudo sed -ir s#GRUB_CMDLINE_LINUX=\"\"#GRUB_CMDLINE_LINUX=\"console=ttyS0\ earlyprintk=ttyS0\ scsi_mod.use_blk_mq=1\ net.ifnames=0\ biosdevname=0\"#g /etc/default/grub' - config.vm.provision "shell", inline: 'sudo update-grub' + config.vm.provision "shell", inline: 'sudo sed -ir s#GRUB_CMDLINE_LINUX=#GRUB_CMDLINE_LINUX=\"console=ttyS0\ earlyprintk=ttyS0\ scsi_mod.use_blk_mq=1\ net.ifnames=0\ biosdevname=0\"#g /etc/default/grub' + config.vm.provision "shell", inline: 'sudo sed -ir s#\"\"#\ #g /etc/default/grub' + + update_grub_command = case distro + when "ubuntu16" then 'sudo update-grub' + when "ubuntu18" then 'sudo update-grub' + when "fedora31" then 'sudo grub2-mkconfig -o /boot/grub2/grub.cfg ; sudo grub2-mkconfig -o /boot/efi/EFI/fedora/grub.cfg' + else abort("#{distro} distribution is not supported yet") + end + config.vm.provision "shell", inline: update_grub_command - # TODO: Next 2 lines break any future ssh communication via "vagrant ssh" - # I'd be good to check NIC names in ifconfig and then sed them in /etc/network/interfaces to eht0, eht1, and so on - config.vm.provision "shell", inline: 'sudo sh -c "echo \"auto eth0\" >> /etc/network/interfaces"' - config.vm.provision "shell", inline: 'sudo sh -c "echo \"iface eth0 inet dhcp\" >> /etc/network/interfaces"' + if distro.include? 
"ubuntu" + # TODO: Next 2 lines break any future ssh communication via "vagrant ssh" + # I'd be good to check NIC names in ifconfig and then sed them in /etc/network/interfaces to eht0, eht1, and so on + config.vm.provision "shell", inline: 'sudo sh -c "echo \"auto eth0\" >> /etc/network/interfaces"' + config.vm.provision "shell", inline: 'sudo sh -c "echo \"iface eth0 inet dhcp\" >> /etc/network/interfaces"' + end if distro.include? "ubuntu18" # This is to avoid annoying "Start job is running for wait for network to be configured" 2 minute timeout diff --git a/scripts/vagrant/autorun-spdk.conf b/scripts/vagrant/autorun-spdk.conf index fa40d1dcad5..067522c6ff3 100644 --- a/scripts/vagrant/autorun-spdk.conf +++ b/scripts/vagrant/autorun-spdk.conf @@ -16,6 +16,7 @@ SPDK_TEST_OCF=0 SPDK_TEST_VHOST=0 SPDK_TEST_VHOST_INIT=0 SPDK_TEST_BLOCKDEV=1 +SPDK_TEST_URING=0 # doesn't work on vm SPDK_TEST_IOAT=0 SPDK_TEST_BLOBFS=0 diff --git a/scripts/vagrant/create_vbox.sh b/scripts/vagrant/create_vbox.sh index 055a73e3981..352f7c3a7b0 100755 --- a/scripts/vagrant/create_vbox.sh +++ b/scripts/vagrant/create_vbox.sh @@ -19,8 +19,8 @@ display_help() { echo echo " Usage: ${0##*/} [-b nvme-backing-file] [-n ] [-s ] [-x ] [-hvrldcu] " echo - echo " distro = " + echo " distro = " echo echo " -s in kb Default: ${SPDK_VAGRANT_VMRAM}" echo " -n 1 to 4 Default: ${SPDK_VAGRANT_VMCPU}" @@ -38,6 +38,7 @@ display_help() { echo " -l Use a local copy of spdk, don't try to rsync from the host." echo " -a Copy spdk/autorun.sh artifacts from VM to host system." echo " -d Deploy a test vm by provisioning all prerequisites for spdk autotest" + echo " -o Add network interface for openstack tests" echo " --qemu-emulator= Path to custom QEMU binary. Only works with libvirt provider" echo " --vagrantfiles-dir= Destination directory to put Vagrantfile into." echo " --package-box Install all dependencies for SPDK and create a local vagrant box version." 
@@ -72,6 +73,7 @@ SPDK_VAGRANT_VMCPU=4 SPDK_VAGRANT_VMRAM=4096 SPDK_VAGRANT_PROVIDER="virtualbox" SPDK_QEMU_EMULATOR="" +SPDK_OPENSTACK_NETWORK=0 OPTIND=1 NVME_DISKS_TYPE="" NVME_DISKS_NAMESPACES="" @@ -82,7 +84,7 @@ VAGRANT_PASSWORD_AUTH=0 VAGRANT_PACKAGE_BOX=0 VAGRANT_HUGE_MEM=0 -while getopts ":b:n:s:x:p:u:vcraldHh-:" opt; do +while getopts ":b:n:s:x:p:u:vcraldoHh-:" opt; do case "${opt}" in -) case "${OPTARG}" in @@ -128,6 +130,9 @@ while getopts ":b:n:s:x:p:u:vcraldHh-:" opt; do d) DEPLOY_TEST_VM=1 ;; + o) + SPDK_OPENSTACK_NETWORK=1 + ;; b) NVME_FILE+="${OPTARG#*=} " ;; @@ -168,6 +173,9 @@ case "${SPDK_VAGRANT_DISTRO}" in fedora31) export SPDK_VAGRANT_DISTRO ;; + fedora32) + export SPDK_VAGRANT_DISTRO + ;; freebsd11) export SPDK_VAGRANT_DISTRO ;; @@ -236,6 +244,7 @@ if [ ${VERBOSE} = 1 ]; then echo SPDK_VAGRANT_PROVIDER=$SPDK_VAGRANT_PROVIDER echo SPDK_VAGRANT_HTTP_PROXY=$SPDK_VAGRANT_HTTP_PROXY echo SPDK_QEMU_EMULATOR=$SPDK_QEMU_EMULATOR + echo SPDK_OPENSTACK_NETWORK=$SPDK_OPENSTACK_NETWORK echo VAGRANT_PACKAGE_BOX=$VAGRANT_PACKAGE_BOX echo fi @@ -244,6 +253,7 @@ export SPDK_VAGRANT_HTTP_PROXY export SPDK_VAGRANT_VMCPU export SPDK_VAGRANT_VMRAM export SPDK_DIR +export SPDK_OPENSTACK_NETWORK export COPY_SPDK_DIR export COPY_SPDK_ARTIFACTS export DEPLOY_TEST_VM @@ -298,11 +308,6 @@ if [ ${DRY_RUN} != 1 ]; then if [ -n "${http_proxy}" ]; then export http_proxy export https_proxy - if vagrant plugin list | grep -q vagrant-proxyconf; then - echo "vagrant-proxyconf already installed... 
skipping" - else - vagrant plugin install vagrant-proxyconf - fi if echo "$SPDK_VAGRANT_DISTRO" | grep -q freebsd; then cat > ~/vagrant_pkg.conf << EOF pkg_env: { diff --git a/scripts/vagrant/create_vhost_vm.sh b/scripts/vagrant/create_vhost_vm.sh index 5895639cf3f..5ad416c3bcc 100755 --- a/scripts/vagrant/create_vhost_vm.sh +++ b/scripts/vagrant/create_vhost_vm.sh @@ -19,7 +19,7 @@ display_help() { echo echo " Usage: ${0##*/} " echo - echo " distro = " + echo " distro = " echo echo " --use-ssh-dir= Use existing spdk_vhost_id_rsa keys from specified directory" echo " for setting up SSH key pair on VM" @@ -79,6 +79,9 @@ case "${SPDK_VAGRANT_DISTRO}" in ubuntu18) export SPDK_VAGRANT_DISTRO ;; + fedora31) + export SPDK_VAGRANT_DISTRO + ;; *) echo " Invalid argument \"${SPDK_VAGRANT_DISTRO}\"" echo " Try: \"$0 -h\"" >&2 diff --git a/scripts/vagrant/local.conf b/scripts/vagrant/local.conf index f392d4bbbec..0ded726d13e 100644 --- a/scripts/vagrant/local.conf +++ b/scripts/vagrant/local.conf @@ -3,7 +3,7 @@ ADMIN_PASSWORD=secret DATABASE_PASSWORD=secret RABBIT_PASSWORD=secret SERVICE_PASSWORD=secret -HOST_IP=10.0.2.15 +HOST_IP=127.0.0.1 USE_PYTHON3=True # These options define expected driver capabilities diff --git a/test/app/bdev_svc/Makefile b/test/app/bdev_svc/Makefile index 4e6ce2111c8..1736d57f06c 100644 --- a/test/app/bdev_svc/Makefile +++ b/test/app/bdev_svc/Makefile @@ -40,7 +40,7 @@ APP = bdev_svc C_SRCS := bdev_svc.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += nvmf event log trace conf thread util bdev accel rpc jsonrpc json sock blobfs_bdev SPDK_LIB_LIST += app_rpc log_rpc bdev_rpc notify diff --git a/test/app/fuzz/iscsi_fuzz/iscsi_fuzz.c b/test/app/fuzz/iscsi_fuzz/iscsi_fuzz.c index 359b95981c2..f222acb48cd 100644 --- a/test/app/fuzz/iscsi_fuzz/iscsi_fuzz.c +++ b/test/app/fuzz/iscsi_fuzz/iscsi_fuzz.c @@ -597,54 +597,54 @@ fuzz_iscsi_send_login_request(struct 
fuzz_iscsi_dev_ctx *dev_ctx, uint8_t sessio req_pdu->bhs.flags = ISCSI_LOGIN_TRANSIT | (ISCSI_OPERATIONAL_NEGOTIATION_PHASE << 2) | ISCSI_FULL_FEATURE_PHASE; - req_pdu->data_segment_len = iscsi_append_text(conn, "InitiatorName", g_init_name, + req_pdu->data_segment_len = iscsi_append_text("InitiatorName", g_init_name, req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "HeaderDigest", "None", + req_pdu->data_segment_len = iscsi_append_text("HeaderDigest", "None", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "DataDigest", "None", + req_pdu->data_segment_len = iscsi_append_text("DataDigest", "None", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "DefaultTime2Wait", "2", + req_pdu->data_segment_len = iscsi_append_text("DefaultTime2Wait", "2", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "DefaultTime2Retain", "0", + req_pdu->data_segment_len = iscsi_append_text("DefaultTime2Retain", "0", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "IFMarker", "No", + req_pdu->data_segment_len = iscsi_append_text("IFMarker", "No", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "OFMarker", "No", + req_pdu->data_segment_len = iscsi_append_text("OFMarker", "No", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "ErrorRecoveryLevel", "0", + req_pdu->data_segment_len = iscsi_append_text("ErrorRecoveryLevel", "0", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); if (session_type == SESSION_TYPE_DISCOVERY) { /* Discovery PDU */ conn->sess->session_type = 
SESSION_TYPE_DISCOVERY; - req_pdu->data_segment_len = iscsi_append_text(conn, "SessionType", "Discovery", + req_pdu->data_segment_len = iscsi_append_text("SessionType", "Discovery", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "MaxRecvDataSegmentLength", "32768", + req_pdu->data_segment_len = iscsi_append_text("MaxRecvDataSegmentLength", "32768", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); } else { /* Login PDU */ conn->sess->session_type = SESSION_TYPE_NORMAL; - req_pdu->data_segment_len = iscsi_append_text(conn, "SessionType", "Normal", + req_pdu->data_segment_len = iscsi_append_text("SessionType", "Normal", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "TargetName", g_tgt_name, + req_pdu->data_segment_len = iscsi_append_text("TargetName", g_tgt_name, req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "InitialR2T", "No", + req_pdu->data_segment_len = iscsi_append_text("InitialR2T", "No", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "ImmediateData", "Yes", + req_pdu->data_segment_len = iscsi_append_text("ImmediateData", "Yes", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "MaxBurstLength", "16776192", + req_pdu->data_segment_len = iscsi_append_text("MaxBurstLength", "16776192", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "FirstBurstLength", "262144", + req_pdu->data_segment_len = iscsi_append_text("FirstBurstLength", "262144", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "MaxOutstandingR2T", "1", + req_pdu->data_segment_len = 
iscsi_append_text("MaxOutstandingR2T", "1", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "MaxConnections", "1", + req_pdu->data_segment_len = iscsi_append_text("MaxConnections", "1", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "DataPDUInOrder", "Yes", + req_pdu->data_segment_len = iscsi_append_text("DataPDUInOrder", "Yes", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "DataSequenceInOrder", "Yes", + req_pdu->data_segment_len = iscsi_append_text("DataSequenceInOrder", "Yes", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); - req_pdu->data_segment_len = iscsi_append_text(conn, "MaxRecvDataSegmentLength", "262144", + req_pdu->data_segment_len = iscsi_append_text("MaxRecvDataSegmentLength", "262144", req_pdu->data, req_pdu->data_buf_len, req_pdu->data_segment_len); } diff --git a/test/app/fuzz/nvme_fuzz/example.json b/test/app/fuzz/nvme_fuzz/example.json index 95540746ec9..21360748d86 100644 --- a/test/app/fuzz/nvme_fuzz/example.json +++ b/test/app/fuzz/nvme_fuzz/example.json @@ -54,7 +54,7 @@ "cdw15": 644909208 }, "struct spdk_nvme_cmd": { - "opc": 12, + "opc": 13, "fuse": 1, "rsvd1": 13, "psdt": 1, @@ -126,7 +126,7 @@ "cdw15": 644909208 }, "struct spdk_nvme_cmd": { - "opc": 12, + "opc": 13, "fuse": 1, "rsvd1": 13, "psdt": 1, @@ -198,7 +198,7 @@ "cdw15": 644909208 }, "struct spdk_nvme_cmd": { - "opc": 12, + "opc": 13, "fuse": 1, "rsvd1": 13, "psdt": 1, @@ -270,7 +270,7 @@ "cdw15": 644909208 }, "struct spdk_nvme_cmd": { - "opc": 12, + "opc": 13, "fuse": 1, "rsvd1": 13, "psdt": 1, diff --git a/test/app/fuzz/nvme_fuzz/nvme_fuzz.c b/test/app/fuzz/nvme_fuzz/nvme_fuzz.c index 911f77ab307..127bc1bff80 100644 --- a/test/app/fuzz/nvme_fuzz/nvme_fuzz.c +++ b/test/app/fuzz/nvme_fuzz/nvme_fuzz.c @@ -456,7 +456,9 @@ submit_qp_cmds(struct 
nvme_fuzz_ns *ns, struct nvme_fuzz_qp *qp) while ((qp->submitted_cmd_counter < g_cmd_array_size || g_cmd_array_size == 0) && !TAILQ_EMPTY(&qp->free_ctx_objs)) { ctx = TAILQ_FIRST(&qp->free_ctx_objs); - prep_nvme_cmd(ns, qp, ctx); + do { + prep_nvme_cmd(ns, qp, ctx); + } while (qp->is_admin && ctx->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST); TAILQ_REMOVE(&qp->free_ctx_objs, ctx, link); TAILQ_INSERT_HEAD(&qp->outstanding_ctx_objs, ctx, link); diff --git a/test/bdev/bdevio/Makefile b/test/bdev/bdevio/Makefile index 660315de71b..83aca58cabb 100644 --- a/test/bdev/bdevio/Makefile +++ b/test/bdev/bdevio/Makefile @@ -40,7 +40,7 @@ APP = bdevio C_SRCS := bdevio.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += app_rpc bdev bdev_rpc accel event trace log conf thread util rpc jsonrpc json sock notify LIBS += -lcunit diff --git a/test/bdev/bdevperf/Makefile b/test/bdev/bdevperf/Makefile index 730d081c3b3..689d7fe109f 100644 --- a/test/bdev/bdevperf/Makefile +++ b/test/bdev/bdevperf/Makefile @@ -40,7 +40,7 @@ APP = bdevperf C_SRCS := bdevperf.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += bdev accel event trace log conf thread util sock notify SPDK_LIB_LIST += rpc jsonrpc json app_rpc log_rpc bdev_rpc @@ -48,4 +48,8 @@ ifeq ($(OS),Linux) SPDK_LIB_LIST += event_nbd nbd endif +ifeq ($(SPDK_ROOT_DIR)/lib/env_dpdk,$(CONFIG_ENV)) +SPDK_LIB_LIST += env_dpdk_rpc +endif + include $(SPDK_ROOT_DIR)/mk/spdk.app.mk diff --git a/test/bdev/bdevperf/bdevperf.c b/test/bdev/bdevperf/bdevperf.c index 71ce391d071..ab6dc9f3547 100644 --- a/test/bdev/bdevperf/bdevperf.c +++ b/test/bdev/bdevperf/bdevperf.c @@ -44,6 +44,11 @@ #include "spdk/string.h" #include "spdk/rpc.h" #include "spdk/bit_array.h" +#include "spdk/conf.h" + +#define BDEVPERF_CONFIG_MAX_FILENAME 1024 +#define BDEVPERF_CONFIG_UNDEFINED 
-1 +#define BDEVPERF_CONFIG_ERROR -2 struct bdevperf_task { struct iovec iov; @@ -62,13 +67,9 @@ static const char *g_workload_type = NULL; static int g_io_size = 0; /* initialize to invalid value so we can detect if user overrides it. */ static int g_rw_percentage = -1; -static int g_is_random; static bool g_verify = false; static bool g_reset = false; static bool g_continue_on_failure = false; -static bool g_unmap = false; -static bool g_write_zeroes = false; -static bool g_flush = false; static bool g_abort = false; static int g_queue_depth = 0; static uint64_t g_time_in_usec; @@ -88,7 +89,10 @@ static bool g_wait_for_tests = false; static struct spdk_jsonrpc_request *g_request = NULL; static bool g_multithread_mode = false; static int g_timeout_in_sec; +static struct spdk_conf *g_bdevperf_conf = NULL; +static const char *g_bdevperf_conf_file = NULL; +static struct spdk_cpuset g_all_cpuset; static struct spdk_poller *g_perf_timer = NULL; static void bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task); @@ -102,6 +106,19 @@ struct bdevperf_job { TAILQ_ENTRY(bdevperf_job) link; struct spdk_thread *thread; + const char *workload_type; + int io_size; + int rw_percentage; + bool is_random; + bool verify; + bool reset; + bool continue_on_failure; + bool unmap; + bool write_zeroes; + bool flush; + bool abort; + int queue_depth; + uint64_t io_completed; uint64_t io_failed; uint64_t io_timeout; @@ -131,6 +148,37 @@ static struct spdk_bdevperf g_bdevperf = { .running_jobs = 0, }; +enum job_config_rw { + JOB_CONFIG_RW_READ = 0, + JOB_CONFIG_RW_WRITE, + JOB_CONFIG_RW_RANDREAD, + JOB_CONFIG_RW_RANDWRITE, + JOB_CONFIG_RW_RW, + JOB_CONFIG_RW_RANDRW, + JOB_CONFIG_RW_VERIFY, + JOB_CONFIG_RW_RESET, + JOB_CONFIG_RW_UNMAP, + JOB_CONFIG_RW_FLUSH, + JOB_CONFIG_RW_WRITE_ZEROES, +}; + +/* Storing values from a section of job config file */ +struct job_config { + const char *name; + const char *filename; + struct spdk_cpuset cpumask; + int bs; + int iodepth; + 
int rwmixread; + int64_t offset; + int length; + enum job_config_rw rw; + TAILQ_ENTRY(job_config) link; +}; + +TAILQ_HEAD(, job_config) job_config_list + = TAILQ_HEAD_INITIALIZER(job_config_list); + static bool g_performance_dump_active = false; struct bdevperf_aggregate_stats { @@ -186,7 +234,7 @@ performance_dump_job(struct bdevperf_aggregate_stats *stats, struct bdevperf_job } else { io_per_second = get_ema_io_per_second(job, stats->ema_period); } - mb_per_second = io_per_second * g_io_size / (1024 * 1024); + mb_per_second = io_per_second * job->io_size / (1024 * 1024); failed_per_second = (double)job->io_failed * 1000000 / stats->io_time_in_usec; timeout_per_second = (double)job->io_timeout * 1000000 / stats->io_time_in_usec; @@ -294,6 +342,20 @@ verify_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int bloc return true; } +static void +free_job_config(void) +{ + struct job_config *config, *tmp; + + spdk_conf_free(g_bdevperf_conf); + g_bdevperf_conf = NULL; + + TAILQ_FOREACH_SAFE(config, &job_config_list, link, tmp) { + TAILQ_REMOVE(&job_config_list, config, link); + free(config); + } +} + static void bdevperf_test_done(void *ctx) { @@ -333,7 +395,7 @@ bdevperf_test_done(void *ctx) free(task); } - if (g_verify) { + if (job->verify) { spdk_bit_array_free(&job->outstanding); } @@ -384,7 +446,7 @@ bdevperf_job_drain(void *ctx) struct bdevperf_job *job = ctx; spdk_poller_unregister(&job->run_timer); - if (g_reset) { + if (job->reset) { spdk_poller_unregister(&job->reset_timer); } @@ -405,7 +467,7 @@ bdevperf_abort_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg job->io_completed++; } else { job->io_failed++; - if (!g_continue_on_failure) { + if (!job->continue_on_failure) { bdevperf_job_drain(job); g_run_rc = -1; } @@ -439,13 +501,13 @@ bdevperf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) md_check = spdk_bdev_get_dif_type(job->bdev) == SPDK_DIF_DISABLE; if (!success) { - if (!g_reset && 
!g_continue_on_failure) { + if (!job->reset && !job->continue_on_failure) { bdevperf_job_drain(job); g_run_rc = -1; printf("task offset: %lu on job bdev=%s fails\n", task->offset_blocks, job->name); } - } else if (g_verify || g_reset) { + } else if (job->verify || job->reset) { spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt); assert(iovcnt == 1); assert(iovs != NULL); @@ -469,7 +531,7 @@ bdevperf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) job->io_failed++; } - if (g_verify) { + if (job->verify) { assert(task->offset_blocks / job->io_size_blocks >= job->ios_base); offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base; @@ -608,7 +670,7 @@ bdevperf_submit_task(void *arg) rc = bdevperf_generate_dif(task); } if (rc == 0) { - cb_fn = (g_verify || g_reset) ? bdevperf_verify_write_complete : bdevperf_complete; + cb_fn = (job->verify || job->reset) ? bdevperf_verify_write_complete : bdevperf_complete; if (g_zcopy) { spdk_bdev_zcopy_end(task->bdev_io, true, cb_fn, task); @@ -671,7 +733,7 @@ bdevperf_submit_task(void *arg) return; } else if (rc != 0) { printf("Failed to submit bdev_io: %d\n", rc); - if (g_verify) { + if (job->verify) { assert(task->offset_blocks / job->io_size_blocks >= job->ios_base); offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base; @@ -703,8 +765,8 @@ bdevperf_zcopy_get_buf_complete(struct spdk_bdev_io *bdev_io, bool success, void task->bdev_io = bdev_io; task->io_type = SPDK_BDEV_IO_TYPE_WRITE; - if (g_verify || g_reset) { - /* When g_verify or g_reset is enabled, task->buf is used for + if (job->verify || job->reset) { + /* When job->verify or job->reset is enabled, task->buf is used for * verification of read after write. For write I/O, when zcopy APIs * are used, task->buf cannot be used, and data must be written to * the data buffer allocated underneath bdev layer instead. 
@@ -764,7 +826,7 @@ bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task) { uint64_t offset_in_ios; - if (g_is_random) { + if (job->is_random) { offset_in_ios = rand_r(&seed) % job->size_in_ios; } else { offset_in_ios = job->offset_in_ios++; @@ -773,10 +835,10 @@ bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task) } /* Increment of offset_in_ios if there's already an outstanding IO - * to that location. We only need this with g_verify as random - * offsets are not supported with g_verify at this time. + * to that location. We only need this with job->verify as random + * offsets are not supported with job->verify at this time. */ - if (g_verify) { + if (job->verify) { assert(spdk_bit_array_find_first_clear(job->outstanding, 0) != UINT32_MAX); while (spdk_bit_array_get(job->outstanding, offset_in_ios)) { @@ -795,7 +857,7 @@ bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task) */ task->offset_blocks = (offset_in_ios + job->ios_base) * job->io_size_blocks; - if (g_verify || g_reset) { + if (job->verify || job->reset) { generate_data(task->buf, job->buf_size, spdk_bdev_get_block_size(job->bdev), task->md_buf, spdk_bdev_get_md_size(job->bdev), @@ -808,14 +870,14 @@ bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task) task->iov.iov_len = job->buf_size; task->io_type = SPDK_BDEV_IO_TYPE_WRITE; } - } else if (g_flush) { + } else if (job->flush) { task->io_type = SPDK_BDEV_IO_TYPE_FLUSH; - } else if (g_unmap) { + } else if (job->unmap) { task->io_type = SPDK_BDEV_IO_TYPE_UNMAP; - } else if (g_write_zeroes) { + } else if (job->write_zeroes) { task->io_type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; - } else if ((g_rw_percentage == 100) || - (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) { + } else if ((job->rw_percentage == 100) || + (job->rw_percentage != 0 && ((rand_r(&seed) % 100) < job->rw_percentage))) { task->io_type = SPDK_BDEV_IO_TYPE_READ; } else { if 
(g_zcopy) { @@ -882,7 +944,7 @@ bdevperf_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io) job->io_timeout++; - if (job->is_draining || !g_abort || + if (job->is_draining || !job->abort || !spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ABORT)) { return; } @@ -910,14 +972,14 @@ bdevperf_job_run(void *ctx) /* Start a timer to stop this I/O chain when the run is over */ job->run_timer = SPDK_POLLER_REGISTER(bdevperf_job_drain, job, g_time_in_usec); - if (g_reset) { + if (job->reset) { job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job, 10 * 1000000); } spdk_bdev_set_timeout(job->bdev_desc, g_timeout_in_sec, bdevperf_timeout_cb, job); - for (i = 0; i < g_queue_depth; i++) { + for (i = 0; i < job->queue_depth; i++) { task = bdevperf_job_get_task(job); bdevperf_submit_single(job, task); } @@ -1042,6 +1104,56 @@ _bdevperf_construct_job_done(void *ctx) } } +/* Checkformat will not allow to use inlined type, + this is a workaround */ +typedef struct spdk_thread *spdk_thread_t; + +static spdk_thread_t +construct_job_thread(struct spdk_cpuset *cpumask, const char *tag) +{ + char thread_name[32]; + struct spdk_cpuset tmp; + + /* This function runs on the master thread. 
*/ + assert(g_master_thread == spdk_get_thread()); + + /* Handle default mask */ + if (spdk_cpuset_count(cpumask) == 0) { + cpumask = &g_all_cpuset; + } + + /* Warn user that mask might need to be changed */ + spdk_cpuset_copy(&tmp, cpumask); + spdk_cpuset_or(&tmp, &g_all_cpuset); + if (!spdk_cpuset_equal(&tmp, &g_all_cpuset)) { + fprintf(stderr, "cpumask for '%s' is too big\n", tag); + } + + snprintf(thread_name, sizeof(thread_name), "%s_%s", + tag, + spdk_cpuset_fmt(cpumask)); + + return spdk_thread_create(thread_name, cpumask); +} + +static uint32_t +_get_next_core(void) +{ + static uint32_t current_core = SPDK_ENV_LCORE_ID_ANY; + + if (current_core == SPDK_ENV_LCORE_ID_ANY) { + current_core = spdk_env_get_first_core(); + return current_core; + } + + current_core = spdk_env_get_next_core(current_core); + if (current_core == SPDK_ENV_LCORE_ID_ANY) { + current_core = spdk_env_get_first_core(); + } + + return current_core; +} + static void _bdevperf_construct_job(void *ctx) { @@ -1067,42 +1179,64 @@ _bdevperf_construct_job(void *ctx) spdk_thread_send_msg(g_master_thread, _bdevperf_construct_job_done, NULL); } +static void +job_init_rw(struct bdevperf_job *job, enum job_config_rw rw) +{ + switch (rw) { + case JOB_CONFIG_RW_READ: + job->rw_percentage = 100; + break; + case JOB_CONFIG_RW_WRITE: + job->rw_percentage = 0; + break; + case JOB_CONFIG_RW_RANDREAD: + job->is_random = true; + job->rw_percentage = 100; + break; + case JOB_CONFIG_RW_RANDWRITE: + job->is_random = true; + job->rw_percentage = 0; + break; + case JOB_CONFIG_RW_RW: + job->is_random = false; + break; + case JOB_CONFIG_RW_RANDRW: + job->is_random = true; + break; + case JOB_CONFIG_RW_VERIFY: + job->verify = true; + job->rw_percentage = 50; + break; + case JOB_CONFIG_RW_RESET: + job->reset = true; + job->verify = true; + job->rw_percentage = 50; + break; + case JOB_CONFIG_RW_UNMAP: + job->unmap = true; + break; + case JOB_CONFIG_RW_FLUSH: + job->flush = true; + break; + case 
JOB_CONFIG_RW_WRITE_ZEROES: + job->write_zeroes = true; + break; + } +} + static int -bdevperf_construct_job(struct spdk_bdev *bdev, struct spdk_cpuset *cpumask, - uint32_t offset, uint32_t length) +bdevperf_construct_job(struct spdk_bdev *bdev, struct job_config *config, + struct spdk_thread *thread) { struct bdevperf_job *job; struct bdevperf_task *task; int block_size, data_block_size; int rc; int task_num, n; - char thread_name[32]; - struct spdk_thread *thread; - - /* This function runs on the master thread. */ - assert(g_master_thread == spdk_get_thread()); - - snprintf(thread_name, sizeof(thread_name), "%s_%s", spdk_bdev_get_name(bdev), - spdk_cpuset_fmt(cpumask)); - - /* Create a new thread for the job */ - thread = spdk_thread_create(thread_name, cpumask); - assert(thread != NULL); block_size = spdk_bdev_get_block_size(bdev); data_block_size = spdk_bdev_get_data_block_size(bdev); - if (g_unmap && !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { - printf("Skipping %s because it does not support unmap\n", spdk_bdev_get_name(bdev)); - return -ENOTSUP; - } - - if ((g_io_size % data_block_size) != 0) { - SPDK_ERRLOG("IO size (%d) is not multiples of data block size of bdev %s (%"PRIu32")\n", - g_io_size, spdk_bdev_get_name(bdev), data_block_size); - return -ENOTSUP; - } - job = calloc(1, sizeof(struct bdevperf_job)); if (!job) { fprintf(stderr, "Unable to allocate memory for new job.\n"); @@ -1116,9 +1250,30 @@ bdevperf_construct_job(struct spdk_bdev *bdev, struct spdk_cpuset *cpumask, return -ENOMEM; } + job->workload_type = g_workload_type; + job->io_size = config->bs; + job->rw_percentage = config->rwmixread; + job->continue_on_failure = g_continue_on_failure; + job->queue_depth = config->iodepth; job->bdev = bdev; - job->io_size_blocks = g_io_size / data_block_size; + job->io_size_blocks = job->io_size / data_block_size; job->buf_size = job->io_size_blocks * block_size; + job_init_rw(job, config->rw); + + if ((job->io_size % data_block_size) 
!= 0) { + SPDK_ERRLOG("IO size (%d) is not multiples of data block size of bdev %s (%"PRIu32")\n", + job->io_size, spdk_bdev_get_name(bdev), data_block_size); + free(job->name); + free(job); + return -ENOTSUP; + } + + if (job->unmap && !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + printf("Skipping %s because it does not support unmap\n", spdk_bdev_get_name(bdev)); + free(job->name); + free(job); + return -ENOTSUP; + } if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) { job->dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK; @@ -1129,17 +1284,17 @@ bdevperf_construct_job(struct spdk_bdev *bdev, struct spdk_cpuset *cpumask, job->offset_in_ios = 0; - if (length != 0) { + if (config->length != 0) { /* Use subset of disk */ - job->size_in_ios = length / job->io_size_blocks; - job->ios_base = offset / job->io_size_blocks; + job->size_in_ios = config->length / job->io_size_blocks; + job->ios_base = config->offset / job->io_size_blocks; } else { /* Use whole disk */ job->size_in_ios = spdk_bdev_get_num_blocks(bdev) / job->io_size_blocks; job->ios_base = 0; } - if (g_verify) { + if (job->verify) { job->outstanding = spdk_bit_array_create(job->size_in_ios); if (job->outstanding == NULL) { SPDK_ERRLOG("Could not create outstanding array bitmap for bdev %s\n", @@ -1152,12 +1307,12 @@ bdevperf_construct_job(struct spdk_bdev *bdev, struct spdk_cpuset *cpumask, TAILQ_INIT(&job->task_list); - task_num = g_queue_depth; - if (g_reset) { + task_num = job->queue_depth; + if (job->reset) { task_num += 1; } - if (g_abort) { - task_num += g_queue_depth; + if (job->abort) { + task_num += job->queue_depth; } TAILQ_INSERT_TAIL(&g_bdevperf.jobs, job, link); @@ -1203,16 +1358,148 @@ bdevperf_construct_job(struct spdk_bdev *bdev, struct spdk_cpuset *cpumask, return rc; } +static int +parse_rw(const char *str, enum job_config_rw ret) +{ + if (str == NULL) { + return ret; + } + + if (!strcmp(str, "read")) { + ret = JOB_CONFIG_RW_READ; + } else if (!strcmp(str, 
"randread")) { + ret = JOB_CONFIG_RW_RANDREAD; + } else if (!strcmp(str, "write")) { + ret = JOB_CONFIG_RW_WRITE; + } else if (!strcmp(str, "randwrite")) { + ret = JOB_CONFIG_RW_RANDWRITE; + } else if (!strcmp(str, "verify")) { + ret = JOB_CONFIG_RW_VERIFY; + } else if (!strcmp(str, "reset")) { + ret = JOB_CONFIG_RW_RESET; + } else if (!strcmp(str, "unmap")) { + ret = JOB_CONFIG_RW_UNMAP; + } else if (!strcmp(str, "write_zeroes")) { + ret = JOB_CONFIG_RW_WRITE_ZEROES; + } else if (!strcmp(str, "flush")) { + ret = JOB_CONFIG_RW_FLUSH; + } else if (!strcmp(str, "rw")) { + ret = JOB_CONFIG_RW_RW; + } else if (!strcmp(str, "randrw")) { + ret = JOB_CONFIG_RW_RANDRW; + } else { + fprintf(stderr, "rw must be one of\n" + "(read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush)\n"); + ret = BDEVPERF_CONFIG_ERROR; + } + + return ret; +} + +static const char * +config_filename_next(const char *filename, char *out) +{ + int i, k; + + if (filename == NULL) { + out[0] = '\0'; + return NULL; + } + + if (filename[0] == ':') { + filename++; + } + + for (i = 0, k = 0; + filename[i] != '\0' && + filename[i] != ':' && + i < BDEVPERF_CONFIG_MAX_FILENAME; + i++) { + if (filename[i] == ' ' || filename[i] == '\t') { + continue; + } + + out[k++] = filename[i]; + } + out[k] = 0; + + return filename + i; +} + +static void +bdevperf_construct_config_jobs(void) +{ + char filename[BDEVPERF_CONFIG_MAX_FILENAME]; + struct spdk_thread *thread; + struct job_config *config; + struct spdk_bdev *bdev; + const char *filenames; + int rc; + + TAILQ_FOREACH(config, &job_config_list, link) { + filenames = config->filename; + + thread = construct_job_thread(&config->cpumask, config->name); + assert(thread); + + while (filenames) { + filenames = config_filename_next(filenames, filename); + if (strlen(filename) == 0) { + break; + } + + bdev = spdk_bdev_get_by_name(filename); + if (!bdev) { + fprintf(stderr, "Unable to find bdev '%s'\n", filename); + g_run_rc = -EINVAL; + return; + } + + rc 
= bdevperf_construct_job(bdev, config, thread); + if (rc < 0) { + g_run_rc = rc; + return; + } + } + } +} + +static int +make_cli_job_config(const char *filename, int64_t offset, int range) +{ + struct job_config *config = calloc(1, sizeof(*config)); + + if (config == NULL) { + fprintf(stderr, "Unable to allocate memory for job config\n"); + return -ENOMEM; + } + + config->name = filename; + config->filename = filename; + spdk_cpuset_zero(&config->cpumask); + spdk_cpuset_set_cpu(&config->cpumask, _get_next_core(), true); + config->bs = g_io_size; + config->iodepth = g_queue_depth; + config->rwmixread = g_rw_percentage; + config->offset = offset; + config->length = range; + config->rw = parse_rw(g_workload_type, BDEVPERF_CONFIG_ERROR); + if ((int)config->rw == BDEVPERF_CONFIG_ERROR) { + return -EINVAL; + } + + TAILQ_INSERT_TAIL(&job_config_list, config, link); + return 0; +} + static void bdevperf_construct_multithread_jobs(void) { struct spdk_bdev *bdev; uint32_t i; - struct spdk_cpuset cpumask; uint32_t num_cores; uint32_t blocks_per_job; - uint32_t offset; - int rc; + int64_t offset; num_cores = 0; SPDK_ENV_FOREACH_CORE(i) { @@ -1235,14 +1522,9 @@ bdevperf_construct_multithread_jobs(void) offset = 0; SPDK_ENV_FOREACH_CORE(i) { - spdk_cpuset_zero(&cpumask); - spdk_cpuset_set_cpu(&cpumask, i, true); - - /* Construct the job */ - rc = bdevperf_construct_job(bdev, &cpumask, offset, blocks_per_job); - if (rc < 0) { - g_run_rc = rc; - break; + g_run_rc = make_cli_job_config(g_job_bdev_name, offset, blocks_per_job); + if (g_run_rc) { + return; } offset += blocks_per_job; @@ -1254,60 +1536,36 @@ bdevperf_construct_multithread_jobs(void) offset = 0; SPDK_ENV_FOREACH_CORE(i) { - spdk_cpuset_zero(&cpumask); - spdk_cpuset_set_cpu(&cpumask, i, true); - - /* Construct the job */ - rc = bdevperf_construct_job(bdev, &cpumask, offset, blocks_per_job); - if (rc < 0) { - g_run_rc = rc; - break; + g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev), + offset, blocks_per_job); + 
if (g_run_rc) { + return; } offset += blocks_per_job; } - if (g_run_rc != 0) { - break; - } - bdev = spdk_bdev_next_leaf(bdev); } } } -static uint32_t -_get_next_core(void) -{ - static uint32_t current_core = SPDK_ENV_LCORE_ID_ANY; - - if (current_core == SPDK_ENV_LCORE_ID_ANY) { - current_core = spdk_env_get_first_core(); - return current_core; - } - - current_core = spdk_env_get_next_core(current_core); - if (current_core == SPDK_ENV_LCORE_ID_ANY) { - current_core = spdk_env_get_first_core(); - } - - return current_core; -} - static void bdevperf_construct_jobs(void) { struct spdk_bdev *bdev; - uint32_t lcore; - struct spdk_cpuset cpumask; - int rc; - /* There are two entirely separate modes for allocating jobs. Standard mode + /* There are three different modes for allocating jobs. Standard mode * (the default) creates one spdk_thread per bdev and runs the I/O job there. * * The -C flag places bdevperf into "multithread" mode, meaning it creates * one spdk_thread per bdev PER CORE, and runs a copy of the job on each. * This runs multiple threads per bdev, effectively. + * + * The -j flag implies "FIO" mode which tries to mimic semantic of FIO jobs. + * In "FIO" mode, threads are spawned per-job instead of per-bdev. + * Each FIO job can be individually parameterized by filename, cpu mask, etc, + * which is different from other modes in that they only support global options. 
*/ /* Increment initial construct_jobs count so that it will never reach 0 in the middle @@ -1315,7 +1573,9 @@ bdevperf_construct_jobs(void) */ g_construct_job_count = 1; - if (g_multithread_mode) { + if (g_bdevperf_conf) { + goto end; + } else if (g_multithread_mode) { bdevperf_construct_multithread_jobs(); goto end; } @@ -1323,16 +1583,8 @@ bdevperf_construct_jobs(void) if (g_job_bdev_name != NULL) { bdev = spdk_bdev_get_by_name(g_job_bdev_name); if (bdev) { - lcore = _get_next_core(); - - spdk_cpuset_zero(&cpumask); - spdk_cpuset_set_cpu(&cpumask, lcore, true); - /* Construct the job */ - rc = bdevperf_construct_job(bdev, &cpumask, 0, 0); - if (rc < 0) { - g_run_rc = rc; - } + g_run_rc = make_cli_job_config(g_job_bdev_name, 0, 0); } else { fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name); } @@ -1340,15 +1592,9 @@ bdevperf_construct_jobs(void) bdev = spdk_bdev_first_leaf(); while (bdev != NULL) { - lcore = _get_next_core(); - - spdk_cpuset_zero(&cpumask); - spdk_cpuset_set_cpu(&cpumask, lcore, true); - /* Construct the job */ - rc = bdevperf_construct_job(bdev, &cpumask, 0, 0); - if (rc < 0) { - g_run_rc = rc; + g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev), 0, 0); + if (g_run_rc) { break; } @@ -1357,6 +1603,10 @@ bdevperf_construct_jobs(void) } end: + if (g_run_rc == 0) { + bdevperf_construct_config_jobs(); + } + if (--g_construct_job_count == 0) { if (g_run_rc != 0) { /* Something failed. */ @@ -1368,11 +1618,232 @@ bdevperf_construct_jobs(void) } } +static int +parse_uint_option(struct spdk_conf_section *s, const char *name, int def) +{ + const char *job_name; + int tmp; + + tmp = spdk_conf_section_get_intval(s, name); + if (tmp == -1) { + /* Field was not found. 
Check default value + * In [global] section it is ok to have undefined values + * but for other sections it is not ok */ + if (def == BDEVPERF_CONFIG_UNDEFINED) { + job_name = spdk_conf_section_get_name(s); + if (strcmp(job_name, "global") == 0) { + return def; + } + + fprintf(stderr, + "Job '%s' has no '%s' assigned\n", + job_name, name); + return BDEVPERF_CONFIG_ERROR; + } + return def; + } + + /* NOTE: get_intval returns nonnegative on success */ + if (tmp < 0) { + fprintf(stderr, "Job '%s' has bad '%s' value.\n", + spdk_conf_section_get_name(s), name); + return BDEVPERF_CONFIG_ERROR; + } + + return tmp; +} + +/* CLI arguments override parameters for global sections */ +static void +config_set_cli_args(struct job_config *config) +{ + if (g_job_bdev_name) { + config->filename = g_job_bdev_name; + } + if (g_io_size > 0) { + config->bs = g_io_size; + } + if (g_queue_depth > 0) { + config->iodepth = g_queue_depth; + } + if (g_rw_percentage > 0) { + config->rwmixread = g_rw_percentage; + } + if (g_workload_type) { + config->rw = parse_rw(g_workload_type, config->rw); + } +} + +static int +read_job_config(void) +{ + struct job_config global_default_config; + struct job_config global_config; + struct spdk_conf_section *s; + struct job_config *config; + const char *cpumask; + const char *rw; + bool is_global; + int n = 0; + + if (g_bdevperf_conf_file == NULL) { + return 0; + } + + g_bdevperf_conf = spdk_conf_allocate(); + if (g_bdevperf_conf == NULL) { + fprintf(stderr, "Could not allocate job config structure\n"); + return 1; + } + + spdk_conf_disable_sections_merge(g_bdevperf_conf); + if (spdk_conf_read(g_bdevperf_conf, g_bdevperf_conf_file)) { + fprintf(stderr, "Invalid job config"); + return 1; + } + + /* Initialize global defaults */ + global_default_config.filename = NULL; + /* Zero mask is the same as g_all_cpuset + * The g_all_cpuset is not initialized yet, + * so use zero mask as the default instead */ + spdk_cpuset_zero(&global_default_config.cpumask); + 
global_default_config.bs = BDEVPERF_CONFIG_UNDEFINED; + global_default_config.iodepth = BDEVPERF_CONFIG_UNDEFINED; + /* bdevperf has no default for -M option but in FIO the default is 50 */ + global_default_config.rwmixread = 50; + global_default_config.offset = 0; + /* length 0 means 100% */ + global_default_config.length = 0; + global_default_config.rw = BDEVPERF_CONFIG_UNDEFINED; + config_set_cli_args(&global_default_config); + + if ((int)global_default_config.rw == BDEVPERF_CONFIG_ERROR) { + return 1; + } + + /* There is only a single instance of global job_config + * We just reset its value when we encounter new [global] section */ + global_config = global_default_config; + + for (s = spdk_conf_first_section(g_bdevperf_conf); + s != NULL; + s = spdk_conf_next_section(s)) { + config = calloc(1, sizeof(*config)); + if (config == NULL) { + fprintf(stderr, "Unable to allocate memory for job config\n"); + return 1; + } + + config->name = spdk_conf_section_get_name(s); + is_global = strcmp(config->name, "global") == 0; + + if (is_global) { + global_config = global_default_config; + } + + config->filename = spdk_conf_section_get_val(s, "filename"); + if (config->filename == NULL) { + config->filename = global_config.filename; + } + if (!is_global) { + if (config->filename == NULL) { + fprintf(stderr, "Job '%s' expects 'filename' parameter\n", config->name); + goto error; + } else if (strnlen(config->filename, BDEVPERF_CONFIG_MAX_FILENAME) + >= BDEVPERF_CONFIG_MAX_FILENAME) { + fprintf(stderr, + "filename for '%s' job is too long. 
Max length is %d\n", + config->name, BDEVPERF_CONFIG_MAX_FILENAME); + goto error; + } + } + + cpumask = spdk_conf_section_get_val(s, "cpumask"); + if (cpumask == NULL) { + config->cpumask = global_config.cpumask; + } else if (spdk_cpuset_parse(&config->cpumask, cpumask)) { + fprintf(stderr, "Job '%s' has bad 'cpumask' value\n", config->name); + goto error; + } + + config->bs = parse_uint_option(s, "bs", global_config.bs); + if (config->bs == BDEVPERF_CONFIG_ERROR) { + goto error; + } else if (config->bs == 0) { + fprintf(stderr, "'bs' of job '%s' must be greater than 0\n", config->name); + goto error; + } + + config->iodepth = parse_uint_option(s, "iodepth", global_config.iodepth); + if (config->iodepth == BDEVPERF_CONFIG_ERROR) { + goto error; + } else if (config->iodepth == 0) { + fprintf(stderr, + "'iodepth' of job '%s' must be greater than 0\n", + config->name); + goto error; + } + + config->rwmixread = parse_uint_option(s, "rwmixread", global_config.rwmixread); + if (config->rwmixread == BDEVPERF_CONFIG_ERROR) { + goto error; + } else if (config->rwmixread > 100) { + fprintf(stderr, + "'rwmixread' value of '%s' job is not in 0-100 range\n", + config->name); + goto error; + } + + config->offset = parse_uint_option(s, "offset", global_config.offset); + if (config->offset == BDEVPERF_CONFIG_ERROR) { + goto error; + } + + config->length = parse_uint_option(s, "length", global_config.length); + if (config->length == BDEVPERF_CONFIG_ERROR) { + goto error; + } + + rw = spdk_conf_section_get_val(s, "rw"); + config->rw = parse_rw(rw, global_config.rw); + if ((int)config->rw == BDEVPERF_CONFIG_ERROR) { + fprintf(stderr, "Job '%s' has bad 'rw' value\n", config->name); + goto error; + } else if (!is_global && (int)config->rw == BDEVPERF_CONFIG_UNDEFINED) { + fprintf(stderr, "Job '%s' has no 'rw' assigned\n", config->name); + goto error; + } + + if (is_global) { + config_set_cli_args(config); + global_config = *config; + free(config); + } else { + 
TAILQ_INSERT_TAIL(&job_config_list, config, link); + n++; + } + } + + printf("Using job config with %d jobs\n", n); + return 0; +error: + free(config); + return 1; +} + static void bdevperf_run(void *arg1) { + uint32_t i; + g_master_thread = spdk_get_thread(); + spdk_cpuset_zero(&g_all_cpuset); + SPDK_ENV_FOREACH_CORE(i) { + spdk_cpuset_set_cpu(&g_all_cpuset, i, true); + } + if (g_wait_for_tests) { /* Do not perform any tests until RPC is received */ return; @@ -1466,6 +1937,8 @@ bdevperf_parse_arg(int ch, char *arg) g_multithread_mode = true; } else if (ch == 'f') { g_continue_on_failure = true; + } else if (ch == 'j') { + g_bdevperf_conf_file = optarg; } else { tmp = spdk_strtoll(optarg, 10); if (tmp < 0) { @@ -1527,6 +2000,7 @@ bdevperf_usage(void) printf(" -z start bdevperf, but wait for RPC to start tests\n"); printf(" -A abort the timeout I/O\n"); printf(" -C enable every core to send I/Os to each bdev\n"); + printf(" -j use job config file"); } static int @@ -1539,17 +2013,17 @@ verify_test_params(struct spdk_app_opts *opts) opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR; } - if (g_queue_depth <= 0) { + if (!g_bdevperf_conf_file && g_queue_depth <= 0) { spdk_app_usage(); bdevperf_usage(); return 1; } - if (g_io_size <= 0) { + if (!g_bdevperf_conf_file && g_io_size <= 0) { spdk_app_usage(); bdevperf_usage(); return 1; } - if (!g_workload_type) { + if (!g_bdevperf_conf_file && !g_workload_type) { spdk_app_usage(); bdevperf_usage(); return 1; @@ -1573,43 +2047,16 @@ verify_test_params(struct spdk_app_opts *opts) return 1; } - if (strcmp(g_workload_type, "read") && - strcmp(g_workload_type, "write") && - strcmp(g_workload_type, "randread") && - strcmp(g_workload_type, "randwrite") && - strcmp(g_workload_type, "rw") && - strcmp(g_workload_type, "randrw") && - strcmp(g_workload_type, "verify") && - strcmp(g_workload_type, "reset") && - strcmp(g_workload_type, "unmap") && - strcmp(g_workload_type, "write_zeroes") && - strcmp(g_workload_type, "flush")) { - fprintf(stderr, - 
"io pattern type must be one of\n" - "(read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush)\n"); - return 1; - } - - if (!strcmp(g_workload_type, "read") || - !strcmp(g_workload_type, "randread")) { - g_rw_percentage = 100; - } - - if (!strcmp(g_workload_type, "write") || - !strcmp(g_workload_type, "randwrite")) { - g_rw_percentage = 0; - } - - if (!strcmp(g_workload_type, "unmap")) { - g_unmap = true; - } - - if (!strcmp(g_workload_type, "write_zeroes")) { - g_write_zeroes = true; + if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { + printf("I/O size of %d is greater than zero copy threshold (%d).\n", + g_io_size, SPDK_BDEV_LARGE_BUF_MAX_SIZE); + printf("Zero copy mechanism will not be used.\n"); + g_zcopy = false; } - if (!strcmp(g_workload_type, "flush")) { - g_flush = true; + if (g_bdevperf_conf_file) { + /* workload_type verification happens during config file parsing */ + return 0; } if (!strcmp(g_workload_type, "verify") || @@ -1651,25 +2098,6 @@ verify_test_params(struct spdk_app_opts *opts) } } - if (!strcmp(g_workload_type, "read") || - !strcmp(g_workload_type, "write") || - !strcmp(g_workload_type, "rw") || - !strcmp(g_workload_type, "verify") || - !strcmp(g_workload_type, "reset") || - !strcmp(g_workload_type, "unmap") || - !strcmp(g_workload_type, "write_zeroes")) { - g_is_random = 0; - } else { - g_is_random = 1; - } - - if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { - printf("I/O size of %d is greater than zero copy threshold (%d).\n", - g_io_size, SPDK_BDEV_LARGE_BUF_MAX_SIZE); - printf("Zero copy mechanism will not be used.\n"); - g_zcopy = false; - } - return 0; } @@ -1685,18 +2113,25 @@ main(int argc, char **argv) opts.reactor_mask = NULL; opts.shutdown_cb = spdk_bdevperf_shutdown_cb; - if ((rc = spdk_app_parse_args(argc, argv, &opts, "xzfq:o:t:w:k:ACM:P:S:T:", NULL, + if ((rc = spdk_app_parse_args(argc, argv, &opts, "xzfq:o:t:w:k:ACM:P:S:T:j:", NULL, bdevperf_parse_arg, bdevperf_usage)) != SPDK_APP_PARSE_ARGS_SUCCESS) { 
return rc; } + if (read_job_config()) { + free_job_config(); + return 1; + } + if (verify_test_params(&opts) != 0) { + free_job_config(); exit(1); } rc = spdk_app_start(&opts, bdevperf_run, NULL); spdk_app_fini(); + free_job_config(); return rc; } diff --git a/test/bdev/bdevperf/common.sh b/test/bdev/bdevperf/common.sh new file mode 100644 index 00000000000..eade380a330 --- /dev/null +++ b/test/bdev/bdevperf/common.sh @@ -0,0 +1,33 @@ +bdevperf=$rootdir/test/bdev/bdevperf/bdevperf + +function create_job() { + local job_section=$1 + local rw=$2 + local filename=$3 + + if [[ $job_section == "global" ]]; then + cat <<- EOF >> "$testdir"/test.conf + [global] + filename=${filename} + EOF + fi + job="[${job_section}]" + echo $global + cat <<- EOF >> "$testdir"/test.conf + ${job} + filename=${filename} + bs=1024 + rwmixread=70 + rw=${rw} + iodepth=256 + cpumask=0xff + EOF +} + +function get_num_jobs() { + echo "$1" | grep -oE "Using job config with [0-9]+ jobs" | grep -oE "[0-9]+" +} + +function cleanup() { + rm -f $testdir/test.conf +} diff --git a/test/bdev/bdevperf/conf.json b/test/bdev/bdevperf/conf.json new file mode 100644 index 00000000000..c58407f380a --- /dev/null +++ b/test/bdev/bdevperf/conf.json @@ -0,0 +1,25 @@ +{ + "subsystems": [ + { + "subsystem": "bdev", + "config": [ + { + "method": "bdev_malloc_create", + "params": { + "name": "Malloc0", + "num_blocks": 102400, + "block_size": 512 + } + }, + { + "method": "bdev_malloc_create", + "params": { + "name": "Malloc1", + "num_blocks": 102400, + "block_size": 512 + } + } + ] + } + ] +} diff --git a/test/bdev/bdevperf/test_config.sh b/test/bdev/bdevperf/test_config.sh new file mode 100755 index 00000000000..911d4e27d3d --- /dev/null +++ b/test/bdev/bdevperf/test_config.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +testdir=$(readlink -f $(dirname $0)) +rootdir=$(readlink -f $testdir/../../..) 
+source $rootdir/test/common/autotest_common.sh +source $testdir/common.sh + +jsonconf=$testdir/conf.json +testconf=$testdir/test.conf + +trap 'cleanup; exit 1' SIGINT SIGTERM EXIT +#Test inheriting filename and rw_mode parameters from global section. +create_job "global" "read" "Malloc0" +create_job "job0" +create_job "job1" +create_job "job2" +create_job "job3" +bdevperf_output=$($bdevperf -t 2 --json $jsonconf -j $testconf 2>&1) +[[ $(get_num_jobs "$bdevperf_output") == "4" ]] + +bdevperf_output=$($bdevperf -C -t 2 --json $jsonconf -j $testconf) + +cleanup +#Test missing global section. +create_job "job0" "write" "Malloc0" +create_job "job1" "write" "Malloc0" +create_job "job2" "write" "Malloc0" +bdevperf_output=$($bdevperf -t 2 --json $jsonconf -j $testconf 2>&1) +[[ $(get_num_jobs "$bdevperf_output") == "3" ]] + +cleanup +#Test inheriting multiple filenames and rw_mode parameters from global section. +create_job "global" "rw" "Malloc0:Malloc1" +create_job "job0" +create_job "job1" +create_job "job2" +create_job "job3" +bdevperf_output=$($bdevperf -t 2 --json $jsonconf -j $testconf 2>&1) +[[ $(get_num_jobs "$bdevperf_output") == "4" ]] +cleanup +trap - SIGINT SIGTERM EXIT diff --git a/test/bdev/blockdev.sh b/test/bdev/blockdev.sh index 12d9c6f52ad..26fbab8da8b 100755 --- a/test/bdev/blockdev.sh +++ b/test/bdev/blockdev.sh @@ -40,9 +40,8 @@ function setup_bdev_conf() { bdev_passthru_create -p TestPT -b Malloc3 bdev_raid_create -n raid0 -z 64 -r 0 -b "Malloc4 Malloc5" RPC - # FIXME: QoS doesn't work properly with json_config, see issue 1146 - #$rpc_py bdev_set_qos_limit --rw_mbytes_per_sec 100 Malloc3 - #$rpc_py bdev_set_qos_limit --rw_ios_per_sec 20000 Malloc0 + $rpc_py bdev_set_qos_limit --rw_mbytes_per_sec 100 Malloc3 + $rpc_py bdev_set_qos_limit --rw_ios_per_sec 20000 Malloc0 if [[ $(uname -s) != "FreeBSD" ]]; then dd if=/dev/zero of="$SPDK_TEST_STORAGE/aiofile" bs=2048 count=5000 "$rpc_py" bdev_aio_create "$SPDK_TEST_STORAGE/aiofile" AIO0 2048 @@ -56,11 
+55,6 @@ function setup_nvme_conf() { function setup_gpt_conf() { if [[ $(uname -s) = Linux ]] && hash sgdisk; then $rootdir/scripts/setup.sh reset - # FIXME: Note that we are racing with the kernel here. There's no guarantee that - # proper object will be already in place under sysfs nor that any udev-like - # helper created proper block devices for us. Replace the below sleep with proper - # udev settle routine. - sleep 1s # Get nvme devices by following drivers' links towards nvme class local nvme_devs=(/sys/bus/pci/drivers/nvme/*/nvme/nvme*/nvme*n*) nvme_dev gpt_nvme="" @@ -399,7 +393,6 @@ fi #----------------------------------------------------- if [ "$test_type" = "gpt" ]; then "$rootdir/scripts/setup.sh" reset - sleep 1s if [[ -b $gpt_nvme ]]; then dd if=/dev/zero of="$gpt_nvme" bs=4096 count=8 oflag=direct fi diff --git a/test/blobfs/fuse/Makefile b/test/blobfs/fuse/Makefile index 75f19cf88d5..09d956e4f58 100644 --- a/test/blobfs/fuse/Makefile +++ b/test/blobfs/fuse/Makefile @@ -40,7 +40,7 @@ APP = fuse C_SRCS := fuse.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += bdev accel event thread util conf trace \ log jsonrpc json rpc sock notify blobfs_bdev diff --git a/test/blobfs/fuse/fuse.c b/test/blobfs/fuse/fuse.c index e434fb505d3..2c1f7da6834 100644 --- a/test/blobfs/fuse/fuse.c +++ b/test/blobfs/fuse/fuse.c @@ -56,7 +56,7 @@ static void fuse_run_cb(void *cb_arg, int fserrno) { if (fserrno) { - printf("Failed to mount filesystem on bdev %s to path %s: %s", + printf("Failed to mount filesystem on bdev %s to path %s: %s\n", g_bdev_name, g_mountpoint, spdk_strerror(fserrno)); spdk_app_stop(0); @@ -69,7 +69,7 @@ fuse_run_cb(void *cb_arg, int fserrno) static void spdk_fuse_run(void *arg1) { - printf("Mounting filesystem on bdev %s to path %s...", + printf("Mounting filesystem on bdev %s to path %s...\n", g_bdev_name, g_mountpoint); fflush(stdout); diff --git 
a/test/blobfs/mkfs/Makefile b/test/blobfs/mkfs/Makefile index 51d8af5d4f4..42eebd9f1a2 100644 --- a/test/blobfs/mkfs/Makefile +++ b/test/blobfs/mkfs/Makefile @@ -40,7 +40,7 @@ APP = mkfs C_SRCS := mkfs.c SPDK_LIB_LIST = $(ALL_MODULES_LIST) -SPDK_LIB_LIST += event_bdev event_accel event_vmd +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) SPDK_LIB_LIST += bdev accel event thread util conf trace \ log jsonrpc json rpc sock notify blobfs_bdev diff --git a/test/blobfs/rocksdb/rocksdb.sh b/test/blobfs/rocksdb/rocksdb.sh index 40615690557..a11703358aa 100755 --- a/test/blobfs/rocksdb/rocksdb.sh +++ b/test/blobfs/rocksdb/rocksdb.sh @@ -4,6 +4,11 @@ testdir=$(readlink -f $(dirname $0)) rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh +sanitize_results() { + process_core + [[ -d $RESULTS_DIR ]] && chmod 644 "$RESULTS_DIR/"* +} + dump_db_bench_on_err() { # Fetch std dump of the last run_step that might have failed [[ -e $db_bench ]] || return 0 @@ -74,7 +79,7 @@ $rootdir/scripts/gen_nvme.sh > $ROCKSDB_CONF echo "[Global]" >> $ROCKSDB_CONF echo "TpointGroupMask 0x80" >> $ROCKSDB_CONF -trap 'dump_db_bench_on_err; run_bsdump || :; rm -f $ROCKSDB_CONF; exit 1' SIGINT SIGTERM EXIT +trap 'dump_db_bench_on_err; run_bsdump || :; rm -f $ROCKSDB_CONF; sanitize_results; exit 1' SIGINT SIGTERM EXIT if [ -z "$SKIP_MKFS" ]; then run_test "blobfs_mkfs" $rootdir/test/blobfs/mkfs/mkfs $ROCKSDB_CONF Nvme0n1 @@ -91,6 +96,15 @@ else DURATION=20 NUM_KEYS=20000000 fi +# Make sure that there's enough memory available for the mempool. Unfortunately, +# db_bench doesn't seem to allocate memory from all numa nodes since all of it +# comes exclusively from node0. With that in mind, try to allocate CACHE_SIZE +# + some_overhead (1G) of pages but only on node0 to make sure that we end up +# with the right amount not allowing setup.sh to split it by using the global +# nr_hugepages setting. 
Instead of bypassing it completely, we use it to also +# get the right size of hugepages. +HUGEMEM=$((CACHE_SIZE + 1024)) HUGENODE=0 \ + "$rootdir/scripts/setup.sh" cd $RESULTS_DIR cp $testdir/common_flags.txt insert_flags.txt @@ -153,3 +167,4 @@ trap - SIGINT SIGTERM EXIT run_bsdump rm -f $ROCKSDB_CONF +sanitize_results diff --git a/test/common/applications.sh b/test/common/applications.sh index 2800600cf80..041af293258 100644 --- a/test/common/applications.sh +++ b/test/common/applications.sh @@ -9,14 +9,16 @@ VHOST_FUZZ_APP=("$_test_app_dir/fuzz/vhost_fuzz/vhost_fuzz") ISCSI_APP=("$_app_dir/iscsi_tgt") NVMF_APP=("$_app_dir/nvmf_tgt") VHOST_APP=("$_app_dir/vhost") +DD_APP=("$_app_dir/spdk_dd") # Check if apps should execute under debug flags if [[ -e $_root/include/spdk/config.h ]]; then if [[ $(< "$_root/include/spdk/config.h") == *"#define SPDK_CONFIG_DEBUG"* ]] \ && ((SPDK_AUTOTEST_DEBUG_APPS)); then - VHOST_FUZZ_APP+=("--log-flags=all") - ISCSI_APP+=("--log-flags=all") - NVMF_APP+=("--log-flags=all") - VHOST_APP+=("--log-flags=all") + VHOST_FUZZ_APP+=("--logflag=all") + ISCSI_APP+=("--logflag=all") + NVMF_APP+=("--logflag=all") + VHOST_APP+=("--logflag=all") + DD_APP+=("--logflag=all") fi fi diff --git a/test/common/autotest_common.sh b/test/common/autotest_common.sh index 866d4b4934f..7a45f4a7951 100755 --- a/test/common/autotest_common.sh +++ b/test/common/autotest_common.sh @@ -100,14 +100,12 @@ export SPDK_TEST_LVOL export SPDK_TEST_JSON : ${SPDK_TEST_REDUCE=0} export SPDK_TEST_REDUCE -: ${SPDK_TEST_VPP=0} -export SPDK_TEST_VPP : ${SPDK_RUN_ASAN=0} export SPDK_RUN_ASAN : ${SPDK_RUN_UBSAN=0} export SPDK_RUN_UBSAN -: ${SPDK_RUN_INSTALLED_DPDK=0} -export SPDK_RUN_INSTALLED_DPDK +: ${SPDK_RUN_EXTERNAL_DPDK=""} +export SPDK_RUN_EXTERNAL_DPDK : ${SPDK_RUN_NON_ROOT=0} export SPDK_RUN_NON_ROOT : ${SPDK_TEST_CRYPTO=0} @@ -126,6 +124,11 @@ export SPDK_TEST_OPAL export SPDK_AUTOTEST_X : ${SPDK_TEST_RAID5=0} export SPDK_TEST_RAID5 +: ${SPDK_TEST_URING=0} +export 
SPDK_TEST_URING + +# Tell setup.sh to wait for block devices upon each reset +export PCI_BLOCK_SYNC_ON_RESET=yes # Export PYTHONPATH with addition of RPC framework. New scripts can be created # specific use cases for tests. @@ -188,19 +191,11 @@ fi if [ "$(uname -s)" = "Linux" ]; then MAKE="make" MAKEFLAGS=${MAKEFLAGS:--j$(nproc)} - DPDK_LINUX_DIR=/usr/share/dpdk/x86_64-default-linuxapp-gcc - if [ -d $DPDK_LINUX_DIR ] && [ $SPDK_RUN_INSTALLED_DPDK -eq 1 ]; then - WITH_DPDK_DIR=$DPDK_LINUX_DIR - fi # Override the default HUGEMEM in scripts/setup.sh to allocate 8GB in hugepages. export HUGEMEM=8192 elif [ "$(uname -s)" = "FreeBSD" ]; then MAKE="gmake" MAKEFLAGS=${MAKEFLAGS:--j$(sysctl -a | grep -E -i 'hw.ncpu' | awk '{print $2}')} - DPDK_FREEBSD_DIR=/usr/local/share/dpdk/x86_64-native-bsdapp-clang - if [ -d $DPDK_FREEBSD_DIR ] && [ $SPDK_RUN_INSTALLED_DPDK -eq 1 ]; then - WITH_DPDK_DIR=$DPDK_FREEBSD_DIR - fi # FreeBSD runs a much more limited set of tests, so keep the default 2GB. export HUGEMEM=2048 else @@ -209,12 +204,8 @@ else fi if [ -z "$output_dir" ]; then - if [ -z "$rootdir" ] || [ ! -d "$rootdir/../output" ]; then - output_dir=. - else - output_dir=$rootdir/../output - fi - export output_dir + mkdir -p "$rootdir/../output" + export output_dir="$rootdir/../output" fi TEST_MODE= @@ -240,12 +231,6 @@ if [[ -z $RPC_PIPE_PID ]] || ! 
kill -0 "$RPC_PIPE_PID" &> /dev/null; then # process, this will make rpc.py stop reading and exit gracefully fi -if [ $SPDK_TEST_VPP -eq 1 ]; then - VPP_PATH="/usr/local/src/vpp-19.04/build-root/install-vpp_debug-native/vpp/" - export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${VPP_PATH}/lib/ - export PATH=${PATH}:${VPP_PATH}/bin/ -fi - function set_test_storage() { [[ -v testdir ]] || return 0 @@ -256,8 +241,19 @@ function set_test_storage() { local source fs size avail mount use local storage_fallback storage_candidates + local storage_fallback_purge - storage_fallback=/tmp/spdk + shopt -s nullglob + storage_fallback_purge=("${TMPDIR:-/tmp}/spdk."??????) + shopt -u nullglob + + if ((${#storage_fallback_purge[@]} > 0)); then + printf '* Purging old temporary test storage (%s)\n' \ + "${storage_fallback_purge[*]}" >&2 + sudo rm -rf "${storage_fallback_purge[@]}" + fi + + storage_fallback=$(mktemp -udt spdk.XXXXXX) storage_candidates=( "$testdir" "$storage_fallback/tests/${testdir##*/}" @@ -374,10 +370,6 @@ function get_config_params() { config_params+=' --with-rbd' fi - if [ $SPDK_TEST_VPP -eq 1 ]; then - config_params+=" --with-vpp=${VPP_PATH}" - fi - # for options with no required dependencies, just test flags, set them here if [ $SPDK_TEST_CRYPTO -eq 1 ]; then config_params+=' --with-crypto' @@ -413,11 +405,13 @@ function get_config_params() { config_params+=' --with-raid5' fi - # By default, --with-dpdk is not set meaning the SPDK build will use the DPDK submodule. - # If a DPDK installation is found in a well-known location though, WITH_DPDK_DIR will be - # set which will override the default and use that DPDK installation instead. 
- if [ -n "$WITH_DPDK_DIR" ]; then - config_params+=" --with-dpdk=$WITH_DPDK_DIR" + # Check whether liburing library header exists + if [ -f /usr/include/liburing/io_uring.h ] && [ $SPDK_TEST_URING -eq 1 ]; then + config_params+=' --with-uring' + fi + + if [ -n "$SPDK_RUN_EXTERNAL_DPDK" ]; then + config_params+=" --with-dpdk=$SPDK_RUN_EXTERNAL_DPDK" fi echo "$config_params" @@ -480,11 +474,34 @@ function rpc_cmd_simple_data_json() { ((${#jq_out[@]} > 0)) || return 1 } -# invert error code of any command and also trigger ERR on 0 (unlike bash ! prefix) function NOT() { - if "$@"; then - return 1 + local es=0 + + "$@" || es=$? + + # Logic looks like so: + # - return false if command exit successfully + # - return false if command exit after receiving a core signal (FIXME: or any signal?) + # - return true if command exit with an error + + # This naively assumes that the process doesn't exit with > 128 on its own. + if ((es > 128)); then + es=$((es & ~128)) + case "$es" in + 3) ;& # SIGQUIT + 4) ;& # SIGILL + 6) ;& # SIGABRT + 8) ;& # SIGFPE + 9) ;& # SIGKILL + 11) es=0 ;; # SIGSEGV + *) es=1 ;; + esac + elif [[ -n $EXIT_STATUS ]] && ((es != EXIT_STATUS)); then + es=0 fi + + # invert error code of any command and also trigger ERR on 0 (unlike bash ! prefix) + ((!es == 0)) } function timing() { @@ -581,7 +598,7 @@ function process_core() { mv $core $output_dir chmod a+r $output_dir/$core ret=1 - done < <(find . -type f \( -name 'core\.?[0-9]*' -o -name '*.core' \) -print0) + done < <(find . -type f \( -name 'core.[0-9]*' -o -name 'core' -o -name '*.core' \) -print0) return $ret } @@ -811,6 +828,41 @@ function rbd_cleanup() { fi } +function nvme_cli_build() { + if [[ -z "${DEPENDENCY_DIR}" ]]; then + echo DEPENDENCY_DIR not defined! + exit 1 + fi + + spdk_nvme_cli="${DEPENDENCY_DIR}/nvme-cli" + + if [[ ! -d $spdk_nvme_cli ]]; then + echo "nvme-cli repository not found at $spdk_nvme_cli; skipping tests." + exit 1 + fi + + if ! 
grep -q "DEF_VER=v1.6" $spdk_nvme_cli/NVME-VERSION-GEN; then + echo "SPDK supports only \"spdk/nvme-cli\" project on \"spdk-1.6\" branch." + exit 1 + fi + + # Build against the version of SPDK under test + pushd $spdk_nvme_cli + + # Remove and recreate git index in case it became corrupted + if ! git clean -dfx; then + rm -f .git/index + git clean -dfx + git reset --hard + fi + + rm -f "$spdk_nvme_cli/spdk" + ln -sf "$rootdir" "$spdk_nvme_cli/spdk" + + make -j$(nproc) LDFLAGS="$(make -s -C $spdk_nvme_cli/spdk ldflags)" + popd +} + function _start_stub() { # Disable ASLR for multi-process testing. SPDK does support using DPDK multi-process, # but ASLR can still be unreliable in some cases. @@ -1134,17 +1186,24 @@ function autotest_cleanup() { fi fi rm -rf "$asan_suppression_file" + if [[ -n $old_core_pattern ]]; then + echo "$old_core_pattern" > /proc/sys/kernel/core_pattern + fi + if [[ -e /proc/$udevadm_pid/status ]]; then + kill "$udevadm_pid" || : + fi + revert_soft_roce } function freebsd_update_contigmem_mod() { if [ $(uname) = FreeBSD ]; then kldunload contigmem.ko || true - if [ -n "$WITH_DPDK_DIR" ]; then + if [ -n "$SPDK_RUN_EXTERNAL_DPDK" ]; then echo "Warning: SPDK only works on FreeBSD with patches that only exist in SPDK's dpdk submodule" - cp -f "$WITH_DPDK_DIR/kmod/contigmem.ko" /boot/modules/ - cp -f "$WITH_DPDK_DIR/kmod/contigmem.ko" /boot/kernel/ - cp -f "$WITH_DPDK_DIR/kmod/nic_uio.ko" /boot/modules/ - cp -f "$WITH_DPDK_DIR/kmod/nic_uio.ko" /boot/kernel/ + cp -f "$SPDK_RUN_EXTERNAL_DPDK/kmod/contigmem.ko" /boot/modules/ + cp -f "$SPDK_RUN_EXTERNAL_DPDK/kmod/contigmem.ko" /boot/kernel/ + cp -f "$SPDK_RUN_EXTERNAL_DPDK/kmod/nic_uio.ko" /boot/modules/ + cp -f "$SPDK_RUN_EXTERNAL_DPDK/kmod/nic_uio.ko" /boot/kernel/ else cp -f "$rootdir/dpdk/build/kmod/contigmem.ko" /boot/modules/ cp -f "$rootdir/dpdk/build/kmod/contigmem.ko" /boot/kernel/ @@ -1184,26 +1243,16 @@ function get_nvme_ctrlr_from_bdf() { printf '%s\n' "$(basename $bdf_sysfs_path)" } 
-function opal_revert_cleanup() { - $SPDK_BIN_DIR/spdk_tgt & - spdk_tgt_pid=$! - waitforlisten $spdk_tgt_pid - - # OPAL test only runs on the first NVMe device - # So we just revert the first one here - bdf=$($rootdir/scripts/gen_nvme.sh --json | jq -r '.config[].params | select(.name=="Nvme0").traddr') - $rootdir/scripts/rpc.py bdev_nvme_attach_controller -b "nvme0" -t "pcie" -a $bdf - # Ignore if this fails. - $rootdir/scripts/rpc.py bdev_nvme_opal_revert -b nvme0 -p test || true - - killprocess $spdk_tgt_pid -} - # Get BDF addresses of all NVMe drives currently attached to # uio-pci-generic or vfio-pci function get_nvme_bdfs() { xtrace_disable - jq -r .config[].params.traddr <<< $(scripts/gen_nvme.sh --json) + bdfs=$(jq -r .config[].params.traddr <<< $($rootdir/scripts/gen_nvme.sh --json)) + if [[ -z $bdfs ]]; then + echo "No devices to test on!" + exit 1 + fi + echo "$bdfs" xtrace_restore } @@ -1218,7 +1267,6 @@ function nvme_namespace_revert() { bdfs=$(get_nvme_bdfs) $rootdir/scripts/setup.sh reset - sleep 1 for bdf in $bdfs; do nvme_ctrlr=/dev/$(get_nvme_ctrlr_from_bdf ${bdf}) @@ -1232,7 +1280,9 @@ function nvme_namespace_revert() { if [[ "$oacs_ns_manage" -ne 0 ]]; then # This assumes every NVMe controller contains single namespace, - # encompassing Total NVM Capacity and formatted as 4k block size. + # encompassing Total NVM Capacity and formatted as 512 block size. + # 512 block size is needed for test/vhost/vhost_boot.sh to + # succesfully run. 
unvmcap=$(nvme id-ctrl ${nvme_ctrlr} | grep unvmcap | cut -d: -f2) if [[ "$unvmcap" -eq 0 ]]; then @@ -1240,7 +1290,7 @@ function nvme_namespace_revert() { continue fi tnvmcap=$(nvme id-ctrl ${nvme_ctrlr} | grep tnvmcap | cut -d: -f2) - blksize=4096 + blksize=512 size=$((tnvmcap / blksize)) @@ -1249,9 +1299,54 @@ function nvme_namespace_revert() { nvme create-ns ${nvme_ctrlr} -s ${size} -c ${size} -b ${blksize} nvme attach-ns ${nvme_ctrlr} -n 1 -c 0 nvme reset ${nvme_ctrlr} - waitforblk "${nvme_ctrlr}n1" + waitforfile "${nvme_ctrlr}n1" + fi + done +} + +# Get BDFs based on device ID, such as 0x0a54 +function get_nvme_bdfs_by_id() { + local bdfs=() + + for bdf in $(get_nvme_bdfs); do + device=$(cat /sys/bus/pci/devices/$bdf/device) || true + if [[ "$device" == "$1" ]]; then + bdfs+=($bdf) fi done + + printf '%s\n' "${bdfs[@]}" +} + +function opal_revert_cleanup() { + # The OPAL CI tests is only used for P4510 devices. + mapfile -t bdfs < <(get_nvme_bdfs_by_id 0x0a54) + if [[ -z ${bdfs[0]} ]]; then + return 0 + fi + + $SPDK_BIN_DIR/spdk_tgt & + spdk_tgt_pid=$! + waitforlisten $spdk_tgt_pid + + for bdf in "${bdfs[@]}"; do + $rootdir/scripts/rpc.py bdev_nvme_attach_controller -b "nvme0" -t "pcie" -a ${bdf} + # Ignore if this fails. + $rootdir/scripts/rpc.py bdev_nvme_opal_revert -b nvme0 -p test || true + done + + killprocess $spdk_tgt_pid +} + +function pap() { + while read -r file; do + cat <<- FILE + --- $file --- + $(<"$file") + --- $file --- + FILE + rm -f "$file" + done < <(find "$@" -type f | sort -u) } # Define temp storage for all the tests. 
Look for 2GB at minimum diff --git a/test/common/config/patch/vpp/fedora29-fix.patch b/test/common/config/patch/vpp/fedora29-fix.patch deleted file mode 100644 index 1baceeb1233..00000000000 --- a/test/common/config/patch/vpp/fedora29-fix.patch +++ /dev/null @@ -1,20 +0,0 @@ -diff --git a/Makefile b/Makefile -index 8c7f3523f..b6a79529c 100644 ---- a/Makefile -+++ b/Makefile -@@ -90,10 +90,12 @@ RPM_DEPENDS += libuuid-devel - RPM_DEPENDS += mbedtls-devel - - ifeq ($(OS_ID),fedora) -- RPM_DEPENDS += dnf-utils -+ ifeq ("$(wildcard /usr/bin/package-cleanup)","") -+ RPM_DEPENDS += dnf-utils -+ endif - RPM_DEPENDS += subunit subunit-devel -- RPM_DEPENDS += compat-openssl10-devel -- RPM_DEPENDS += python2-devel python34-ply -+ RPM_DEPENDS += openssl-devel -+ RPM_DEPENDS += python2-devel - RPM_DEPENDS += python2-virtualenv - RPM_DEPENDS += cmake - RPM_DEPENDS_GROUPS = 'C Development Tools and Libraries' diff --git a/test/common/config/patch/vpp/fedora30-fix.patch b/test/common/config/patch/vpp/fedora30-fix.patch deleted file mode 100644 index 82900f1f74d..00000000000 --- a/test/common/config/patch/vpp/fedora30-fix.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/Makefile b/Makefile -index 8c7f3523f..20814ee8d 100644 ---- a/Makefile -+++ b/Makefile -@@ -92,8 +92,8 @@ RPM_DEPENDS += mbedtls-devel - ifeq ($(OS_ID),fedora) - RPM_DEPENDS += dnf-utils - RPM_DEPENDS += subunit subunit-devel -- RPM_DEPENDS += compat-openssl10-devel -- RPM_DEPENDS += python2-devel python34-ply -+ RPM_DEPENDS += openssl-devel -+ RPM_DEPENDS += python2-devel - RPM_DEPENDS += python2-virtualenv - RPM_DEPENDS += cmake - RPM_DEPENDS_GROUPS = 'C Development Tools and Libraries' -diff --git a/build/external/packages/dpdk.mk b/build/external/packages/dpdk.mk -index a551151bb..b0258017a 100644 ---- a/build/external/packages/dpdk.mk -+++ b/build/external/packages/dpdk.mk -@@ -147,7 +147,7 @@ endif - endif - endif - --DPDK_EXTRA_CFLAGS += -L$(I)/lib -I$(I)/include -+DPDK_EXTRA_CFLAGS += -L$(I)/lib 
-I$(I)/include -Wno-address-of-packed-member - - # assemble DPDK make arguments - DPDK_MAKE_ARGS := -C $(DPDK_SOURCE) -j $(JOBS) \ -diff --git a/src/plugins/crypto_ia32/CMakeLists.txt b/src/plugins/crypto_ia32/CMakeLists.txt -index a100cdbb6..92e408098 100644 ---- a/src/plugins/crypto_ia32/CMakeLists.txt -+++ b/src/plugins/crypto_ia32/CMakeLists.txt -@@ -22,3 +22,4 @@ add_vpp_plugin(crypto_ia32 - ) - - target_compile_options(crypto_ia32_plugin PRIVATE "-march=silvermont") -+target_compile_options(crypto_ia32_plugin PRIVATE "-maes") -diff --git a/src/plugins/crypto_ipsecmb/CMakeLists.txt b/src/plugins/crypto_ipsecmb/CMakeLists.txt -index 0d08032c0..6a7eb148f 100644 ---- a/src/plugins/crypto_ipsecmb/CMakeLists.txt -+++ b/src/plugins/crypto_ipsecmb/CMakeLists.txt -@@ -39,3 +39,4 @@ else() - endif() - - target_compile_options(crypto_ipsecmb_plugin PRIVATE "-march=silvermont") -+target_compile_options(crypto_ipsecmb_plugin PRIVATE "-maes") diff --git a/test/common/config/patch/vpp/fedora31-fix.patch b/test/common/config/patch/vpp/fedora31-fix.patch deleted file mode 100644 index 82900f1f74d..00000000000 --- a/test/common/config/patch/vpp/fedora31-fix.patch +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/Makefile b/Makefile -index 8c7f3523f..20814ee8d 100644 ---- a/Makefile -+++ b/Makefile -@@ -92,8 +92,8 @@ RPM_DEPENDS += mbedtls-devel - ifeq ($(OS_ID),fedora) - RPM_DEPENDS += dnf-utils - RPM_DEPENDS += subunit subunit-devel -- RPM_DEPENDS += compat-openssl10-devel -- RPM_DEPENDS += python2-devel python34-ply -+ RPM_DEPENDS += openssl-devel -+ RPM_DEPENDS += python2-devel - RPM_DEPENDS += python2-virtualenv - RPM_DEPENDS += cmake - RPM_DEPENDS_GROUPS = 'C Development Tools and Libraries' -diff --git a/build/external/packages/dpdk.mk b/build/external/packages/dpdk.mk -index a551151bb..b0258017a 100644 ---- a/build/external/packages/dpdk.mk -+++ b/build/external/packages/dpdk.mk -@@ -147,7 +147,7 @@ endif - endif - endif - --DPDK_EXTRA_CFLAGS += -L$(I)/lib -I$(I)/include 
-+DPDK_EXTRA_CFLAGS += -L$(I)/lib -I$(I)/include -Wno-address-of-packed-member - - # assemble DPDK make arguments - DPDK_MAKE_ARGS := -C $(DPDK_SOURCE) -j $(JOBS) \ -diff --git a/src/plugins/crypto_ia32/CMakeLists.txt b/src/plugins/crypto_ia32/CMakeLists.txt -index a100cdbb6..92e408098 100644 ---- a/src/plugins/crypto_ia32/CMakeLists.txt -+++ b/src/plugins/crypto_ia32/CMakeLists.txt -@@ -22,3 +22,4 @@ add_vpp_plugin(crypto_ia32 - ) - - target_compile_options(crypto_ia32_plugin PRIVATE "-march=silvermont") -+target_compile_options(crypto_ia32_plugin PRIVATE "-maes") -diff --git a/src/plugins/crypto_ipsecmb/CMakeLists.txt b/src/plugins/crypto_ipsecmb/CMakeLists.txt -index 0d08032c0..6a7eb148f 100644 ---- a/src/plugins/crypto_ipsecmb/CMakeLists.txt -+++ b/src/plugins/crypto_ipsecmb/CMakeLists.txt -@@ -39,3 +39,4 @@ else() - endif() - - target_compile_options(crypto_ipsecmb_plugin PRIVATE "-march=silvermont") -+target_compile_options(crypto_ipsecmb_plugin PRIVATE "-maes") diff --git a/test/common/config/pkgdep/apt-get b/test/common/config/pkgdep/apt-get index a1630620d4d..2b2a8f4e524 100644 --- a/test/common/config/pkgdep/apt-get +++ b/test/common/config/pkgdep/apt-get @@ -36,10 +36,6 @@ pre_install() { fi if ! install rdma-core; then echo "Package rdma-core is avaliable at Ubuntu 18 [universe] repositorium" >&2 - install rdmacm-utils - install ibverbs-utils - else - LIBRXE_INSTALL=false fi if ! 
install libpmempool1; then echo "Package libpmempool1 is available at Ubuntu 18 [universe] repositorium" >&2 @@ -69,6 +65,7 @@ packages=( ceph gdb fio + libaio-dev librbd-dev linux-headers-generic libgflags-dev @@ -93,8 +90,28 @@ packages=( bc smartmontools wget + xfsprogs + ibverbs-utils + rdmacm-utils ) +install_vagrant_dependencies() { + local vagrant_packages + vagrant_packages=( + qemu + libvirt-bin + ebtables + dnsmasq-base + libxslt-dev + libxml2-dev + libvirt-dev + zlib1g-dev + ruby-dev + ) + + install "${vagrant_packages[@]}" +} + if [[ $OSID != ubuntu ]]; then echo "Located apt-get package manager, but it was tested for Ubuntu only" fi diff --git a/test/common/config/pkgdep/dnf b/test/common/config/pkgdep/dnf index b009f106efd..3e7ce03efa9 100644 --- a/test/common/config/pkgdep/dnf +++ b/test/common/config/pkgdep/dnf @@ -17,6 +17,7 @@ packages=( ceph gdb fio + libaio-devel librbd-devel kernel-devel gflags-devel @@ -56,16 +57,10 @@ packages=( systemd-devel smartmontools wget + xfsprogs ) -pre_install() { - if [[ $INTSALL_TSOCKS == true ]]; then - # currently, tsocks package is retired in fedora 31, so don't exit in case - # installation failed - # FIXME: Review when fedora starts to successfully build this package again. - install tsocks || echo "Installation of the tsocks package failed, proxy may not be available" - fi -} +pre_install() { :; } if [[ $OSID != fedora ]]; then echo "Located dnf package manager, but it was tested for Fedora only" diff --git a/test/common/config/pkgdep/git b/test/common/config/pkgdep/git new file mode 100644 index 00000000000..7090a576f2e --- /dev/null +++ b/test/common/config/pkgdep/git @@ -0,0 +1,375 @@ +function install_spdk() { + mkdir -p "$GIT_REPOS/spdk_repo/output" || echo "Can not create spdk_repo/output directory." 
+ + if [[ -d $GIT_REPOS/spdk_repo/spdk ]]; then + echo "spdk source already present, not cloning" + else + git -C "$GIT_REPOS/spdk_repo" clone "${GIT_REPO_SPDK}" + fi + git -C "$GIT_REPOS/spdk_repo/spdk" config submodule.dpdk.url "${GIT_REPO_DPDK}" + git -C "$GIT_REPOS/spdk_repo/spdk" config submodule.intel-ipsec-mb.url "${GIT_REPO_INTEL_IPSEC_MB}" + git -C "$GIT_REPOS/spdk_repo/spdk" submodule update --init --recursive +} + +function install_refspdk() { + local last_release + local output_dir + local config_params + local rootdir + + # Create a reference SPDK build for ABI tests + git -C "$GIT_REPOS/spdk_repo/spdk" fetch --tags + last_release=$(git -C "$GIT_REPOS/spdk_repo/spdk" tag | sort --version-sort | grep -v rc | tail -n1) + output_dir="$GIT_REPOS/spdk_$(tr . _ < <(tr -d '[:alpha:]' <<< $last_release))" + + if [[ ! -d $output_dir ]]; then + cp -r "$GIT_REPOS/spdk_repo/spdk" "$output_dir" + fi + + git -C "$output_dir" checkout "$last_release" + git -C "$output_dir" submodule update --init + + cat > $HOME/autorun-spdk.conf <<- EOF + SPDK_BUILD_SHARED_OBJECT=1 + SPDK_TEST_AUTOBUILD=1 + SPDK_TEST_UNITTEST=1 + SPDK_TEST_BLOCKDEV=1 + SPDK_TEST_PMDK=1 + SPDK_TEST_ISAL=1 + SPDK_TEST_REDUCE=1 + SPDK_TEST_CRYPTO=1 + SPDK_TEST_FTL=1 + SPDK_TEST_OCF=1 + SPDK_TEST_RAID5=1 + SPDK_TEST_RBD=1 + SPDK_RUN_ASAN=1 + SPDK_RUN_UBSAN=1 + EOF + + mkdir -p $HOME/output + + ( + rootdir="$output_dir" + source $HOME/autorun-spdk.conf + source $output_dir/test/common/autotest_common.sh + + # Prepare separate, fixed, cmdline for the FreeBSD, Issue #1397. 
+ if [[ $OSID == freebsd ]]; then + config_params="--enable-debug --enable-werror" + config_params+=" --with-idxd --with-fio=/usr/src/fio" + config_params+=" --disable-unit-tests --without-isal" + MAKE=gmake + else + config_params="$(get_config_params)" + fi + $output_dir/configure $(echo $config_params | sed 's/--enable-coverage//g') + if [[ $OSID != freebsd ]]; then + $MAKE -C $output_dir $MAKEFLAGS include/spdk/config.h + CONFIG_OCF_PATH="$output_dir/ocf" $MAKE -C $output_dir/lib/env_ocf $MAKEFLAGS exportlib O=$output_dir/build/ocf.a + $output_dir/configure $config_params --with-ocf=$output_dir/build/ocf.a --with-shared + fi + $MAKE -C $output_dir $MAKEFLAGS + ) +} + +function install_qat() { + # Disect the kernel version into maj, min, release and local version + local kernel_maj kernel_min kernel_rel kernel_loc + local kernel_ver + + IFS=".-" read -r kernel_{maj,min,rel,loc} < /proc/sys/kernel/osrelease + kernel_ver=$((kernel_maj << 16 | kernel_min << 8 | kernel_rel)) + + if [[ -e /sys/module/qat_c62x ]]; then + sudo modprobe -r qat_c62x || : + fi + if [[ -d $GIT_REPOS/QAT ]]; then + sudo rm -rf "$GIT_REPOS/QAT" + fi + + mkdir "$GIT_REPOS/QAT" + + tar -C "$GIT_REPOS/QAT" -xzof - < <(wget -O- "$DRIVER_LOCATION_QAT") + + # Patch use of hidden types in kernels >= 5.6.3. See .patch for details + if ((kernel_ver >= 0x050603)); then + # Patch only the driver version that was tested + [[ ${DRIVER_LOCATION_QAT##*/} == qat1.7.l.4.9.0-00008.tar.gz ]] && patch --dir="$GIT_REPOS/QAT" -p1 + fi < "$rootdir/test/common/config/pkgdep/patches/qat/0001-timespec.patch" + + # Patch name of the pci_aer function which was renamed in kernels >= 5.7.1. 
See .patch for details + if ((kernel_ver >= 0x050701)); then + # Patch only the driver version that was tested + [[ ${DRIVER_LOCATION_QAT##*/} == qat1.7.l.4.9.0-00008.tar.gz ]] && patch --dir="$GIT_REPOS/QAT" -p1 + fi < "$rootdir/test/common/config/pkgdep/patches/qat/0001-pci_aer.patch" + + # Patch use of cryptohash.h which was removed in favor of crypto/sha.h in kernels >= 5.8. See .patch for details + if ((kernel_ver >= 0x050800)); then + # Patch only the driver version that was tested + [[ ${DRIVER_LOCATION_QAT##*/} == qat1.7.l.4.9.0-00008.tar.gz ]] && patch --dir="$GIT_REPOS/QAT" -p1 + fi < "$rootdir/test/common/config/pkgdep/patches/qat/0001-cryptohash.patch" + + (cd "$GIT_REPOS/QAT" && sudo ./configure --enable-icp-sriov=host && sudo make install) + + if ! sudo service qat_service start; then + echo "failed to start the qat service. Something may be wrong with your device or package." + fi +} + +function install_rocksdb() { + # Rocksdb is installed for use with the blobfs tests. + if [ ! -d /usr/src/rocksdb ]; then + git clone "${GIT_REPO_ROCKSDB}" "$GIT_REPOS/rocksdb" + git -C "$GIT_REPOS/rocksdb" checkout spdk-v5.6.1 + sudo mv "$GIT_REPOS/rocksdb" /usr/src/ + else + sudo git -C /usr/src/rocksdb checkout spdk-v5.6.1 + echo "rocksdb already in /usr/src. Not checking out again" + fi +} + +function install_fio() { + # This version of fio is installed in /usr/src/fio to enable + # building the spdk fio plugin. + local fio_version="fio-3.19" + + if [ ! -d /usr/src/fio ]; then + if [ ! 
-d fio ]; then + git clone "${GIT_REPO_FIO}" "$GIT_REPOS/fio" + sudo mv "$GIT_REPOS/fio" /usr/src/ + else + sudo mv "$GIT_REPOS/fio" /usr/src/ + fi + ( + git -C /usr/src/fio checkout master \ + && git -C /usr/src/fio pull \ + && git -C /usr/src/fio checkout $fio_version \ + && if [ $OSID == 'freebsd' ]; then + gmake -C /usr/src/fio -j${jobs} \ + && sudo gmake -C /usr/src/fio install + else + make -C /usr/src/fio -j${jobs} \ + && sudo make -C /usr/src/fio install + fi + ) + else + echo "fio already in /usr/src/fio. Not installing" + fi +} + +function install_flamegraph() { + # Flamegraph is used when printing out timing graphs for the tests. + if [ ! -d /usr/local/FlameGraph ]; then + git clone "${GIT_REPO_FLAMEGRAPH}" "$GIT_REPOS/FlameGraph" + mkdir -p /usr/local + sudo mv "$GIT_REPOS/FlameGraph" /usr/local/FlameGraph + else + echo "flamegraph already installed. Skipping" + fi +} + +function install_qemu() { + # Two versions of QEMU are used in the tests. + # Stock QEMU is used for vhost. A special fork + # is used to test OCSSDs. Install both. + + # Forked QEMU + SPDK_QEMU_BRANCH=spdk-5.0.0 + mkdir -p "$GIT_REPOS/qemu" + if [[ ! -d $GIT_REPOS/qemu/$SPDK_QEMU_BRANCH ]]; then + git clone "${GIT_REPO_QEMU}" -b "$SPDK_QEMU_BRANCH" "$GIT_REPOS/qemu/$SPDK_QEMU_BRANCH" + else + echo "qemu already checked out. Skipping" + fi + + declare -a opt_params=("--prefix=/usr/local/qemu/$SPDK_QEMU_BRANCH") + if ((gcc_version >= 9)); then + # GCC 9 fails to compile Qemu due to some old warnings which were not detected by older versions. + opt_params+=("--extra-cflags=-Wno-error=stringop-truncation -Wno-error=deprecated-declarations -Wno-error=incompatible-pointer-types -Wno-error=format-truncation") + opt_params+=("--disable-glusterfs") + fi + + # Most tsocks proxies rely on a configuration file in /etc/tsocks.conf. + # If using tsocks, please make sure to complete this config before trying to build qemu. 
+ if [[ $INSTALL_TSOCKS == true && $NO_TSOCKS != true ]]; then + if hash tsocks 2> /dev/null; then + opt_params+=("--with-git='tsocks git'") + fi + fi + + sed -i s@git://git.qemu.org/@https://github.com/qemu/@g "$GIT_REPOS/qemu/$SPDK_QEMU_BRANCH/.gitmodules" + sed -i s@git://git.qemu.org/@https://github.com/qemu/@g "$GIT_REPOS/qemu/$SPDK_QEMU_BRANCH/.git/config" + sed -i s@git://git.qemu-project.org/@https://github.com/qemu/@g "$GIT_REPOS/qemu/$SPDK_QEMU_BRANCH/.gitmodules" + sed -i s@git://git.qemu-project.org/@https://github.com/qemu/@g "$GIT_REPOS/qemu/$SPDK_QEMU_BRANCH/.git/config" + # The qemu configure script places several output files in the CWD. + (cd "$GIT_REPOS/qemu/$SPDK_QEMU_BRANCH" && ./configure "${opt_params[@]}" --target-list="x86_64-softmmu" --enable-kvm --enable-linux-aio --enable-numa) + + make -C "$GIT_REPOS/qemu/$SPDK_QEMU_BRANCH" -j${jobs} + sudo make -C "$GIT_REPOS/qemu/$SPDK_QEMU_BRANCH" install +} + +function install_nvmecli() { + SPDK_NVME_CLI_BRANCH=spdk-1.6 + if [[ ! -d $GIT_REPOS/nvme-cli ]]; then + git clone "${GIT_REPO_SPDK_NVME_CLI}" -b "$SPDK_NVME_CLI_BRANCH" "$GIT_REPOS/nvme-cli" + else + echo "nvme-cli already checked out. Skipping" + fi + if [ ! -d "/usr/local/src/nvme-cli" ]; then + # Changes required for SPDK are already merged on top of + # nvme-cli, however not released yet. + # Support for SPDK should be released in nvme-cli >1.11.1 + if [[ ! -d $GIT_REPOS/nvme-cli-cuse ]]; then + git clone "https://github.com/linux-nvme/nvme-cli.git" "$GIT_REPOS/nvme-cli-cuse" + fi + git -C "$GIT_REPOS/nvme-cli-cuse" checkout "e770466615096a6d41f038a28819b00bc3078e1d" + make -C "$GIT_REPOS/nvme-cli-cuse" + sudo mv "$GIT_REPOS/nvme-cli-cuse" /usr/local/src/nvme-cli + fi +} + +function install_libiscsi() { + # We currently don't make any changes to the libiscsi repository for our tests, but it is possible that we will need + # to later. Cloning from git is just future proofing the machines. + if [[ ! 
-d $GIT_REPOS/libiscsi ]]; then + git clone "${GIT_REPO_LIBISCSI}" "$GIT_REPOS/libiscsi" + else + echo "libiscsi already checked out. Skipping" + fi + (cd "$GIT_REPOS/libiscsi" && ./autogen.sh && ./configure --prefix=/usr/local/libiscsi) + make -C "$GIT_REPOS/libiscsi" -j${jobs} + sudo make -C "$GIT_REPOS/libiscsi" install +} + +function install_git() { + install zlib-devel curl-devel + tar -C "$GIT_REPOS" -xzof <(wget -qO- "$GIT_REPO_GIT") + (cd "$GIT_REPOS/git-$GIT_VERSION" \ + && make configure \ + && ./configure --prefix=/usr/local/git \ + && sudo make -j${jobs} install) + sudo sh -c "echo 'export PATH=/usr/local/git/bin:$PATH' >> /etc/bashrc" + export "PATH=/usr/local/git/bin:$PATH" +} + +function install_extra_pkgs() { + if [[ $INSTALL_QAT == true ]]; then + install libudev-devel || install libudev-dev + fi + + if [[ $INSTALL_QEMU == true ]]; then + install qemu-system-x86 qemu-img \ + || install qemu-system-x86 qemu-utils \ + || install qemu + fi +} + +function install_vagrant() { + local vagrant_version="2.2.7" + local vagrant_installer="vagrant_${vagrant_version}_x86_64.deb" + local vagrant_plugins=(vagrant-libvirt vagrant-sshfs vagrant-cachier vagrant-proxyconf) + + if [[ $OSID != ubuntu ]]; then + error "Currently only ubuntu is supported" + fi + + # Install vagrant and it's plugins dependencies + # function should be defined in pkgdep/$package_manager file + install_vagrant_dependencies + + # Download and install vagrant + if hash vagrant &> /dev/null; then + echo "Vagrant is already installed" + else + wget "https://releases.hashicorp.com/vagrant/${vagrant_version}/${vagrant_installer}" + sudo dpkg -i "${vagrant_installer}" + fi + vagrant --version + + # Install vagrant plugins + local vagrant_plugin_list + vagrant_plugin_list=$(vagrant plugin list) + + local plugin + for plugin in "${vagrant_plugins[@]}"; do + if grep -Fq "$plugin" <<< "$vagrant_plugin_list"; then + echo "$plugin already installed" + else + vagrant plugin install "$plugin" + fi + done 
+} + +GIT_VERSION=2.25.1 +: ${GIT_REPO_SPDK=https://github.com/spdk/spdk.git} +export GIT_REPO_SPDK +: ${GIT_REPO_DPDK=https://github.com/spdk/dpdk.git} +export GIT_REPO_DPDK +: ${GIT_REPO_ROCKSDB=https://review.spdk.io/spdk/rocksdb} +export GIT_REPO_ROCKSDB +: ${GIT_REPO_FIO=http://git.kernel.dk/fio.git} +export GIT_REPO_FIO +: ${GIT_REPO_FLAMEGRAPH=https://github.com/brendangregg/FlameGraph.git} +export GIT_REPO_FLAMEGRAPH +: ${GIT_REPO_QEMU=https://github.com/spdk/qemu} +export GIT_REPO_QEMU +: ${GIT_REPO_LIBISCSI=https://github.com/sahlberg/libiscsi} +export GIT_REPO_LIBISCSI +: ${GIT_REPO_SPDK_NVME_CLI=https://github.com/spdk/nvme-cli} +export GIT_REPO_SPDK_NVME_CLI +: ${GIT_REPO_INTEL_IPSEC_MB=https://github.com/spdk/intel-ipsec-mb.git} +export GIT_REPO_INTEL_IPSEC_MB +: ${DRIVER_LOCATION_QAT=https://01.org/sites/default/files/downloads//qat1.7.l.4.9.0-00008.tar.gz} +export DRIVER_LOCATION_QAT +: ${GIT_REPO_GIT=https://github.com/git/git/archive/v${GIT_VERSION}.tar.gz} +export GIT_REPO_GIT +GIT_REPOS=${GIT_REPOS:-$HOME} + +gcc_version=$(gcc -dumpversion) gcc_version=${gcc_version%%.*} +if [[ $ID == centos ]] && (( VERSION_ID == 7 )); then + # install proper version of the git first + install_git +fi + +IFS="," read -ra conf_env <<< "$CONF" +for conf in "${conf_env[@]}"; do + export "INSTALL_${conf^^}=true" +done +sources=(install_refspdk) + +if [[ $OS == FreeBSD ]]; then + jobs=$(($(sysctl -n hw.ncpu) * 2)) +else + jobs=$(($(nproc) * 2)) + sources+=( + install_libiscsi + install_nvmecli + install_qat + install_rocksdb + install_flamegraph + install_qemu + ) +fi +sources+=(install_fio) +sources+=(install_vagrant) + +sudo mkdir -p /usr/{,local}/src +sudo mkdir -p "$GIT_REPOS" + +install_extra_pkgs + +if [[ $INSTALL_REFSPDK == true ]]; then + # Serialize builds as refspdk depends on spdk + install_spdk + install_refspdk +else + sources+=(install_spdk) +fi + +for source in "${sources[@]}"; do + source_conf=${source^^} + if [[ ${!source_conf} == true ]]; then + 
"$source" & + fi +done +wait diff --git a/test/common/config/pkgdep/os/centos b/test/common/config/pkgdep/os/centos new file mode 120000 index 00000000000..476fbc01232 --- /dev/null +++ b/test/common/config/pkgdep/os/centos @@ -0,0 +1 @@ +rhel \ No newline at end of file diff --git a/test/common/config/pkgdep/os/fedora b/test/common/config/pkgdep/os/fedora new file mode 120000 index 00000000000..476fbc01232 --- /dev/null +++ b/test/common/config/pkgdep/os/fedora @@ -0,0 +1 @@ +rhel \ No newline at end of file diff --git a/test/common/config/pkgdep/os/rhel b/test/common/config/pkgdep/os/rhel new file mode 100644 index 00000000000..f1901f6c883 --- /dev/null +++ b/test/common/config/pkgdep/os/rhel @@ -0,0 +1,17 @@ +pre_install() { + if [[ $INTSALL_TSOCKS == true ]]; then + # currently, tsocks package is retired in fedora 31, so don't exit in case + # installation failed + # FIXME: Review when fedora starts to successfully build this package again. + install tsocks || echo "Installation of the tsocks package failed, proxy may not be available" + fi + if [[ $ID == centos ]] && (( VERSION_ID == 8 )); then + sudo "$package_manager" update -y --refresh + fi + + install nbd || { + install wget + wget -O nbd.rpm https://download-ib01.fedoraproject.org/pub/epel/7/x86_64/Packages/n/nbd-3.14-2.el7.x86_64.rpm + install nbd.rpm + } +} diff --git a/test/common/config/pkgdep/pacman b/test/common/config/pkgdep/pacman index 43d3db2f538..d6d3d0c53b4 100644 --- a/test/common/config/pkgdep/pacman +++ b/test/common/config/pkgdep/pacman @@ -52,6 +52,7 @@ packages=( smartmontools parted wget + xfsprogs ) # TODO: diff --git a/test/common/config/pkgdep/patches/qat/0001-cryptohash.patch b/test/common/config/pkgdep/patches/qat/0001-cryptohash.patch new file mode 100644 index 00000000000..2d87c8f3625 --- /dev/null +++ b/test/common/config/pkgdep/patches/qat/0001-cryptohash.patch @@ -0,0 +1,17 @@ +cryptohash.h was dropped and merged with crypto/sha.sh in 5.8 kernel. 
Details in: +https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=228c4f265c6eb60eaa4ed0edb3bf7c113173576c + +--- +diff --git a/quickassist/utilities/osal/src/linux/kernel_space/OsalCryptoInterface.c b/quickassist/utilities/osal/src/linux/kernel_space/OsalCryptoInterface.c +index 4c389da..e602377 100644 +--- a/quickassist/utilities/osal/src/linux/kernel_space/OsalCryptoInterface.c ++++ b/quickassist/utilities/osal/src/linux/kernel_space/OsalCryptoInterface.c +@@ -66,7 +66,7 @@ + + #include "Osal.h" + #include +-#include ++#include + #include + #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)) + #include diff --git a/test/common/config/pkgdep/patches/qat/0001-pci_aer.patch b/test/common/config/pkgdep/patches/qat/0001-pci_aer.patch new file mode 100644 index 00000000000..7516ac4fee7 --- /dev/null +++ b/test/common/config/pkgdep/patches/qat/0001-pci_aer.patch @@ -0,0 +1,20 @@ +In kernel 5.7 the pci_cleanup_aer_uncorrect_error_status() function was +renamed with the following commit: + +git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=894020fdd88c1e9a74c60b67c0f19f1c7696ba2f + +This simply updates the function call with the proper name (pci_aer_clear_nonfatal_status()). 
+ +--- +diff --git a/quickassist/qat/drivers/crypto/qat/qat_common/adf_aer.c b/quickassist/qat/drivers/crypto/qat/qat_common/adf_aer.c +index a6ce6df..545bb79 100644 +--- a/quickassist/qat/drivers/crypto/qat/qat_common/adf_aer.c ++++ b/quickassist/qat/drivers/crypto/qat/qat_common/adf_aer.c +@@ -304,7 +304,7 @@ static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev) + pr_err("QAT: Can't find acceleration device\n"); + return PCI_ERS_RESULT_DISCONNECT; + } +- pci_cleanup_aer_uncorrect_error_status(pdev); ++ pci_aer_clear_nonfatal_status(pdev); + if (adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_SYNC)) + return PCI_ERS_RESULT_DISCONNECT; diff --git a/test/common/config/pkgdep/patches/qat/0001-timespec.patch b/test/common/config/pkgdep/patches/qat/0001-timespec.patch new file mode 100644 index 00000000000..04fb053e1f8 --- /dev/null +++ b/test/common/config/pkgdep/patches/qat/0001-timespec.patch @@ -0,0 +1,35 @@ +This patch attempts to expose timespec and getnstimeofday which were +explicitly hidden in the 5.6 kernel with the introduction of the +following commits: + +git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c766d1472c70d25ad475cf56042af1652e792b23 +git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=412c53a680a97cb1ae2c0ab60230e193bee86387 + +Code received from users@dpdk.org, issue tracked under QATE-59888. 
+ +--- +diff --git a/quickassist/lookaside/access_layer/src/sample_code/performance/framework/linux/kernel_space/cpa_sample_code_utils.c b/quickassist/lookaside/access_layer/src/sample_code/performance/framework/linux/kernel_space/cpa_sample_code_utils.c +index 4639834..523e376 100644 +--- a/quickassist/lookaside/access_layer/src/sample_code/performance/framework/linux/kernel_space/cpa_sample_code_utils.c ++++ b/quickassist/lookaside/access_layer/src/sample_code/performance/framework/linux/kernel_space/cpa_sample_code_utils.c +@@ -107,6 +107,8 @@ atomic_t arrived; + extern struct device perf_device; + #endif + ++#define timespec timespec64 ++#define getnstimeofday ktime_get_real_ts64 + + /* Define a number for timeout */ + #define SAMPLE_CODE_MAX_LONG (0x7FFFFFFF) +diff --git a/quickassist/qat/compat/qat_compat.h b/quickassist/qat/compat/qat_compat.h +index 2a02eaf..3515092 100644 +--- a/quickassist/qat/compat/qat_compat.h ++++ b/quickassist/qat/compat/qat_compat.h +@@ -466,4 +466,7 @@ static inline void pci_ignore_hotplug(struct pci_dev *dev) + #if (RHEL_RELEASE_CODE && RHEL_RELEASE_VERSION(7, 3) <= RHEL_RELEASE_CODE) + #define QAT_KPT_CAP_DISCOVERY + #endif ++ ++#define timespec timespec64 ++#define getnstimeofday ktime_get_real_ts64 + #endif /* _QAT_COMPAT_H_ */ diff --git a/test/common/config/pkgdep/pkg b/test/common/config/pkgdep/pkg index 3f3f41725d4..170c7aaf6ff 100644 --- a/test/common/config/pkgdep/pkg +++ b/test/common/config/pkgdep/pkg @@ -11,6 +11,7 @@ install() { } packages=( + etc_os-release pciutils jq gdb diff --git a/test/common/config/pkgdep/swupd b/test/common/config/pkgdep/swupd new file mode 100644 index 00000000000..c1d2a8a6b39 --- /dev/null +++ b/test/common/config/pkgdep/swupd @@ -0,0 +1,21 @@ +package_manager=swupd + +upgrade() { + sudo "$package_manager" update -y +} + +install() { + (($#)) || return 0 + + sudo "$package_manager" bundle-add -y "$@" +} + +packages=( + jq +) + +pre_install() { + if [[ $INTSALL_TSOCKS == true ]]; then + 
install tsocks || echo "Installation of the tsocks package failed, proxy may not be available" + fi +} diff --git a/test/common/config/pkgdep/yum b/test/common/config/pkgdep/yum index 32e89bc153b..a6e3076f0d4 100644 --- a/test/common/config/pkgdep/yum +++ b/test/common/config/pkgdep/yum @@ -17,6 +17,7 @@ packages=( nvme-cli gdb fio + libaio-devel librbd-devel kernel-devel gflags-devel @@ -53,15 +54,8 @@ packages=( systemd-devel python3 wget + btrfs-progs + xfsprogs ) -pre_install() { - if [[ $ID == centos ]] && (( VERSION_ID == 8 )); then - "$package_manager" update -y --refresh - fi - - install nbd || { - wget -O nbd.rpm https://download-ib01.fedoraproject.org/pub/epel/7/x86_64/Packages/n/nbd-3.14-2.el7.x86_64.rpm - install nbd.rpm - } -} +pre_install() { :; } diff --git a/test/common/config/vm_setup.conf b/test/common/config/vm_setup.conf index a8e58d82a0e..29cee14acb5 100644 --- a/test/common/config/vm_setup.conf +++ b/test/common/config/vm_setup.conf @@ -6,7 +6,6 @@ GIT_REPO_ROCKSDB=https://review.gerrithub.io/spdk/rocksdb GIT_REPO_FIO=http://git.kernel.dk/fio.git GIT_REPO_FLAMEGRAPH=https://github.com/brendangregg/FlameGraph.git GIT_REPO_QEMU=https://github.com/spdk/qemu -GIT_REPO_VPP=https://gerrit.fd.io/r/vpp GIT_REPO_LIBISCSI=https://github.com/sahlberg/libiscsi GIT_REPO_SPDK_NVME_CLI=https://github.com/spdk/nvme-cli DRIVER_LOCATION_QAT=https://01.org/sites/default/files/downloads/intelr-quickassist-technology/qat1.7.l.4.3.0-00033.tar.gz diff --git a/test/common/config/vm_setup.sh b/test/common/config/vm_setup.sh index 83a8c26cb72..9eb8eafe8b1 100755 --- a/test/common/config/vm_setup.sh +++ b/test/common/config/vm_setup.sh @@ -18,356 +18,91 @@ # We have made a lot of progress with removing hardcoded paths from the tests, +sudo() { + "$(type -P sudo)" -E "$@" +} + set -e +shopt -s extglob VM_SETUP_PATH=$(readlink -f ${BASH_SOURCE%/*}) UPGRADE=false INSTALL=false -CONF="rocksdb,fio,flamegraph,tsocks,qemu,vpp,libiscsi,nvmecli,qat,refspdk" -gcc_version=$(gcc 
-dumpversion) gcc_version=${gcc_version%%.*} - -if [ $(uname -s) == "FreeBSD" ]; then - OSID="freebsd" - OSVERSION=$(freebsd-version | cut -d. -f1) - PACKAGEMNG='pkg' -else - OSID=$(source /etc/os-release && echo $ID) - OSVERSION=$(source /etc/os-release && echo $VERSION_ID) - PACKAGEMNG='undefined' -fi - -function install_reference_spdk() { - local last_release - local output_dir - local config_params - local rootdir - - # Create a reference SPDK build for ABI tests - if echo $CONF | grep -q refspdk; then - git -C spdk_repo/spdk fetch --tags - last_release=$(git -C spdk_repo/spdk tag | sort --version-sort | grep -v rc | tail -n1) - git -C spdk_repo/spdk checkout $last_release - git -C spdk_repo/spdk submodule update --init - output_dir="$HOME/spdk_$(tr . _ < <(tr -d '[:alpha:]' <<< $last_release))" - - cp -r spdk_repo/spdk $output_dir - - cat > $HOME/autorun-spdk.conf << EOF -SPDK_BUILD_SHARED_OBJECT=1 -SPDK_TEST_AUTOBUILD=1 -SPDK_TEST_UNITTEST=1 -SPDK_TEST_BLOCKDEV=1 -SPDK_TEST_PMDK=1 -SPDK_TEST_ISAL=1 -SPDK_TEST_REDUCE=1 -SPDK_TEST_CRYPTO=1 -SPDK_TEST_FTL=1 -SPDK_TEST_OCF=1 -SPDK_TEST_RAID5=1 -SPDK_TEST_RBD=1 -SPDK_RUN_ASAN=1 -SPDK_RUN_UBSAN=1 -EOF - - mkdir -p $HOME/output - rootdir="$output_dir" - source $HOME/autorun-spdk.conf - source $output_dir/test/common/autotest_common.sh - - # Prepare separate, fixed, cmdline for the FreeBSD, Issue #1397. 
- if [[ $OSID == freebsd ]]; then - config_params="--enable-debug --enable-werror" - config_params+=" --with-idxd --with-fio=/usr/src/fio" - config_params+=" --disable-unit-tests --without-isal" - MAKE=gmake - else - config_params="$(get_config_params)" - fi - $output_dir/configure $(echo $config_params | sed 's/--enable-coverage//g') - if [[ $OSID != freebsd ]]; then - $MAKE -C $output_dir $MAKEFLAGS include/spdk/config.h - CONFIG_OCF_PATH="$output_dir/ocf" $MAKE -C $output_dir/lib/env_ocf $MAKEFLAGS exportlib O=$output_dir/build/ocf.a - $output_dir/configure $config_params --with-ocf=$output_dir/build/ocf.a --with-shared - fi - $MAKE -C $output_dir $MAKEFLAGS - fi -} - -function install_qat() { - - if [ "$PACKAGEMNG" = "dnf" ]; then - sudo dnf install -y libudev-devel - elif [ "$PACKAGEMNG" = "apt-get" ]; then - sudo apt-get install -y libudev-dev - fi - - if echo $CONF | grep -q qat; then - qat_tarball=$(basename $DRIVER_LOCATION_QAT) - kernel_maj=$(uname -r | cut -d'.' -f1) - kernel_min=$(uname -r | cut -d'.' -f2) - - sudo modprobe -r qat_c62x - if [ -d /QAT ]; then - sudo rm -rf /QAT/ - fi - - sudo mkdir /QAT - - wget $DRIVER_LOCATION_QAT - sudo cp $qat_tarball /QAT/ - (cd /QAT && sudo tar zxof /QAT/$qat_tarball) - - #The driver version 1.7.l.4.3.0-00033 contains a reference to a deprecated function. Remove it so the build won't fail. - if [ $kernel_maj -le 4 ]; then - if [ $kernel_min -le 17 ]; then - sudo sed -i 's/rdtscll(timestamp);/timestamp = rdtsc_ordered();/g' \ - /QAT/quickassist/utilities/osal/src/linux/kernel_space/OsalServices.c || true - fi - fi - - (cd /QAT && sudo ./configure --enable-icp-sriov=host && sudo make install) +CONF="rocksdb,fio,flamegraph,tsocks,qemu,libiscsi,nvmecli,qat,spdk,refspdk,vagrant" +package_manager= - if sudo service qat_service start; then - echo "failed to start the qat service. Something may be wrong with your device or package." 
- fi - fi -} - -function install_rocksdb() { - if echo $CONF | grep -q rocksdb; then - # Rocksdb is installed for use with the blobfs tests. - if [ ! -d /usr/src/rocksdb ]; then - git clone "${GIT_REPO_ROCKSDB}" - git -C ./rocksdb checkout spdk-v5.6.1 - sudo mv rocksdb /usr/src/ - else - sudo git -C /usr/src/rocksdb checkout spdk-v5.6.1 - echo "rocksdb already in /usr/src. Not checking out again" - fi - fi +function usage() { + echo "This script is intended to automate the environment setup for a linux virtual machine." + echo "Please run this script as your regular user. The script will make calls to sudo as needed." + echo "" + echo "./vm_setup.sh" + echo " -h --help" + echo " -u --upgrade Run $package_manager upgrade" + echo " -i --install-deps Install $package_manager based dependencies" + echo " -t --test-conf List of test configurations to enable (${CONF})" + echo " -c --conf-path Path to configuration file" + echo " -d --dir-git Path to where git sources should be saved" + echo " -s --disable-tsocks Disable use of tsocks" + exit ${1:-0} } -function install_fio() { - if echo $CONF | grep -q fio; then - # This version of fio is installed in /usr/src/fio to enable - # building the spdk fio plugin. - local fio_version="fio-3.19" - - if [ ! -d /usr/src/fio ]; then - if [ ! -d fio ]; then - git clone "${GIT_REPO_FIO}" - sudo mv fio /usr/src/ - else - sudo mv fio /usr/src/ - fi - ( - git -C /usr/src/fio checkout master \ - && git -C /usr/src/fio pull \ - && git -C /usr/src/fio checkout $fio_version \ - && if [ $OSID == 'freebsd' ]; then - gmake -C /usr/src/fio -j${jobs} \ - && sudo gmake -C /usr/src/fio install - else - make -C /usr/src/fio -j${jobs} \ - && sudo make -C /usr/src/fio install - fi - ) - else - echo "fio already in /usr/src/fio. 
Not installing" - fi - fi +function error() { + printf "%s\n\n" "$1" >&2 + usage 1 } -function install_flamegraph() { - if echo $CONF | grep -q flamegraph; then - # Flamegraph is used when printing out timing graphs for the tests. - if [ ! -d /usr/local/FlameGraph ]; then - git clone "${GIT_REPO_FLAMEGRAPH}" - mkdir -p /usr/local - sudo mv FlameGraph /usr/local/FlameGraph - else - echo "flamegraph already installed. Skipping" - fi +function set_os_id_version() { + if [[ $(uname -s) == FreeBSD ]] && ! pkg info -q etc_os-release; then + echo "Please install 'etc_os-release' package" >&2 + echo "pkg install -y etc_os-release" >&2 + exit 2 fi -} - -function install_qemu() { - if echo $CONF | grep -q qemu; then - # Two versions of QEMU are used in the tests. - # Stock QEMU is used for vhost. A special fork - # is used to test OCSSDs. Install both. - - # Packaged QEMU - if [ "$PACKAGEMNG" = "dnf" ]; then - sudo dnf install -y qemu-system-x86 qemu-img - elif [ "$PACKAGEMNG" = "apt-get" ]; then - sudo apt-get install -y qemu-system-x86 qemu-utils - elif [ "$PACKAGEMNG" = "pacman" ]; then - sudo pacman -Sy --needed --noconfirm qemu - elif [[ $PACKAGEMNG == "yum" ]]; then - sudo yum install -y qemu-system-x86 qemu-img - fi - - # Forked QEMU - SPDK_QEMU_BRANCH=spdk-5.0.0 - mkdir -p qemu - if [ ! -d "qemu/$SPDK_QEMU_BRANCH" ]; then - git -C ./qemu clone "${GIT_REPO_QEMU}" -b "$SPDK_QEMU_BRANCH" "$SPDK_QEMU_BRANCH" - else - echo "qemu already checked out. Skipping" - fi - - declare -a opt_params=("--prefix=/usr/local/qemu/$SPDK_QEMU_BRANCH") - if ((gcc_version >= 9)); then - # GCC 9 fails to compile Qemu due to some old warnings which were not detected by older versions. - opt_params+=("--extra-cflags=-Wno-error=stringop-truncation -Wno-error=deprecated-declarations -Wno-error=incompatible-pointer-types -Wno-error=format-truncation") - opt_params+=("--disable-glusterfs") - fi - # Most tsocks proxies rely on a configuration file in /etc/tsocks.conf. 
- # If using tsocks, please make sure to complete this config before trying to build qemu. - if echo $CONF | grep -q tsocks; then - if hash tsocks 2> /dev/null; then - opt_params+=("--with-git='tsocks git'") - fi - fi - - sed -i s@git://git.qemu.org/@https://github.com/qemu/@g qemu/$SPDK_QEMU_BRANCH/.gitmodules - sed -i s@git://git.qemu.org/@https://github.com/qemu/@g qemu/$SPDK_QEMU_BRANCH/.git/config - sed -i s@git://git.qemu-project.org/@https://github.com/qemu/@g qemu/$SPDK_QEMU_BRANCH/.gitmodules - sed -i s@git://git.qemu-project.org/@https://github.com/qemu/@g qemu/$SPDK_QEMU_BRANCH/.git/config - # The qemu configure script places several output files in the CWD. - (cd qemu/$SPDK_QEMU_BRANCH && ./configure "${opt_params[@]}" --target-list="x86_64-softmmu" --enable-kvm --enable-linux-aio --enable-numa) - - make -C ./qemu/$SPDK_QEMU_BRANCH -j${jobs} - sudo make -C ./qemu/$SPDK_QEMU_BRANCH install + if [[ -f /etc/os-release ]]; then + source /etc/os-release + elif [[ -f /usr/local/etc/os-release ]]; then + # On FreeBSD file is located under /usr/local if etc_os-release package is installed + source /usr/local/etc/os-release + else + echo "File os-release not found" >&2 + exit 3 fi -} - -function install_vpp() { - if echo $CONF | grep -q vpp; then - if [ -d /usr/local/src/vpp ]; then - echo "vpp already cloned." - if [ ! 
-d /usr/local/src/vpp/build-root ]; then - echo "build-root has not been done" - echo "remove the $(pwd) and start again" - exit 1 - fi - else - git clone "${GIT_REPO_VPP}" - git -C ./vpp checkout v19.04.2 - - if [ "${OSID}" == 'fedora' ]; then - if [ ${OSVERSION} -eq 29 ]; then - git -C ./vpp apply ${VM_SETUP_PATH}/patch/vpp/fedora29-fix.patch - fi - if [ ${OSVERSION} -eq 30 ]; then - git -C ./vpp apply ${VM_SETUP_PATH}/patch/vpp/fedora30-fix.patch - fi - if ((OVERSION == 31)); then - git -C ./vpp apply "$VM_SETUP_PATH/patch/vpp/fedora31-fix.patch" - fi - fi - - # vpp depends on python-ply, however some packages on different Fedoras don't - # provide ply.lex. To make sure vpp won't fail, try to reinstall ply via pip. - sudo pip3 uninstall -y ply || true - sudo pip3 install ply || true - # Installing required dependencies for building VPP - yes | make -C ./vpp install-dep + OSID=$ID + OSVERSION=$VERSION_ID - make -C ./vpp build -j${jobs} - - sudo mv ./vpp /usr/local/src/vpp-19.04 - fi - fi + echo "OS-ID: $OSID | OS-Version: $OSVERSION" } -function install_nvmecli() { - if echo $CONF | grep -q nvmecli; then - SPDK_NVME_CLI_BRANCH=spdk-1.6 - if [ ! -d nvme-cli ]; then - git clone "${GIT_REPO_SPDK_NVME_CLI}" -b "$SPDK_NVME_CLI_BRANCH" - else - echo "nvme-cli already checked out. Skipping" - fi - if [ ! -d "/usr/local/src/nvme-cli" ]; then - # Changes required for SPDK are already merged on top of - # nvme-cli, however not released yet. 
- # Support for SPDK should be released in nvme-cli >1.11.1 - git clone "https://github.com/linux-nvme/nvme-cli.git" "nvme-cli-cuse" - git -C ./nvme-cli-cuse checkout "e770466615096a6d41f038a28819b00bc3078e1d" - make -C ./nvme-cli-cuse - sudo mv ./nvme-cli-cuse /usr/local/src/nvme-cli - fi - fi -} +function detect_package_manager() { + local manager_scripts + manager_scripts=("$vmsetupdir/pkgdep/"!(git)) -function install_libiscsi() { - if echo $CONF | grep -q libiscsi; then - # We currently don't make any changes to the libiscsi repository for our tests, but it is possible that we will need - # to later. Cloning from git is just future proofing the machines. - if [ ! -d libiscsi ]; then - git clone "${GIT_REPO_LIBISCSI}" - else - echo "libiscsi already checked out. Skipping" + local package_manager_lib + for package_manager_lib in "${manager_scripts[@]}"; do + package_manager=${package_manager_lib##*/} + if hash "${package_manager}" &> /dev/null; then + source "${package_manager_lib}" + return fi - (cd libiscsi && ./autogen.sh && ./configure --prefix=/usr/local/libiscsi) - make -C ./libiscsi -j${jobs} - sudo make -C ./libiscsi install - fi -} + done -function install_git() { - sudo yum install -y zlib-devel curl-devel - tar -xzof <(wget -qO- "$GIT_REPO_GIT") - (cd git-${GIT_VERSION} \ - && make configure \ - && ./configure --prefix=/usr/local/git \ - && sudo make -j${jobs} install) - sudo sh -c "echo 'export PATH=/usr/local/git/bin:$PATH' >> /etc/bashrc" - exec $SHELL + package_manager="undefined" } -function usage() { - echo "This script is intended to automate the environment setup for a linux virtual machine." - echo "Please run this script as your regular user. The script will make calls to sudo as needed." 
- echo "" - echo "./vm_setup.sh" - echo " -h --help" - echo " -u --upgrade Run $PACKAGEMNG upgrade" - echo " -i --install-deps Install $PACKAGEMNG based dependencies" - echo " -t --test-conf List of test configurations to enable (${CONF})" - echo " -c --conf-path Path to configuration file" - exit 0 -} +vmsetupdir=$(readlink -f "$(dirname "$0")") +rootdir=$(readlink -f "$vmsetupdir/../../../") -# Get package manager # -if hash yum &> /dev/null; then - PACKAGEMNG=yum -elif hash dnf &> /dev/null; then - PACKAGEMNG=dnf -elif hash apt-get &> /dev/null; then - PACKAGEMNG=apt-get -elif hash pacman &> /dev/null; then - PACKAGEMNG=pacman -elif hash pkg &> /dev/null; then - PACKAGEMNG=pkg -else - echo 'Supported package manager not found. Script supports "dnf" and "apt-get".' -fi +set_os_id_version +detect_package_manager -if [ $PACKAGEMNG == 'apt-get' ] && [ $OSID != 'ubuntu' ]; then - echo 'Located apt-get package manager, but it was tested for Ubuntu only' -fi -if [ $PACKAGEMNG == 'dnf' ] && [ $OSID != 'fedora' ]; then - echo 'Located dnf package manager, but it was tested for Fedora only' +if [[ -e $vmsetupdir/pkgdep/os/$OSID ]]; then + source "$vmsetupdir/pkgdep/os/$OSID" fi # Parse input arguments # -while getopts 'iuht:c:-:' optchar; do +while getopts 'd:siuht:c:-:' optchar; do case "$optchar" in -) case "$OPTARG" in @@ -376,10 +111,9 @@ while getopts 'iuht:c:-:' optchar; do install-deps) INSTALL=true ;; test-conf=*) CONF="${OPTARG#*=}" ;; conf-path=*) CONF_PATH="${OPTARG#*=}" ;; - *) - echo "Invalid argument '$OPTARG'" - usage - ;; + dir-git=*) GIT_REPOS="${OPTARG#*=}" ;; + disable-tsocks) NO_TSOCKS=true ;; + *) error "Invalid argument '$OPTARG'" ;; esac ;; h) usage ;; @@ -387,357 +121,37 @@ while getopts 'iuht:c:-:' optchar; do i) INSTALL=true ;; t) CONF="$OPTARG" ;; c) CONF_PATH="$OPTARG" ;; - *) - echo "Invalid argument '$OPTARG'" - usage - ;; + d) GIT_REPOS="$OPTARG" ;; + s) NO_TSOCKS=true ;; + *) error "Invalid argument '$OPTARG'" ;; esac done -if [ -n 
"$CONF_PATH" ]; then - if [ ! -f "$CONF_PATH" ]; then - echo Configuration file does not exist: "$CONF_PATH" - exit 1 - else - source "$CONF_PATH" - fi +if [[ $package_manager == undefined ]]; then + echo "Supported package manager not found. Script supports:" + printf " * %s\n" "${manager_scripts[@]##*/}" + exit 1 fi -cd ~ -GIT_VERSION=2.25.1 -: ${GIT_REPO_SPDK=https://github.com/spdk/spdk.git} -export GIT_REPO_SPDK -: ${GIT_REPO_DPDK=https://github.com/spdk/dpdk.git} -export GIT_REPO_DPDK -: ${GIT_REPO_ROCKSDB=https://review.spdk.io/spdk/rocksdb} -export GIT_REPO_ROCKSDB -: ${GIT_REPO_FIO=http://git.kernel.dk/fio.git} -export GIT_REPO_FIO -: ${GIT_REPO_FLAMEGRAPH=https://github.com/brendangregg/FlameGraph.git} -export GIT_REPO_FLAMEGRAPH -: ${GIT_REPO_QEMU=https://github.com/spdk/qemu} -export GIT_REPO_QEMU -: ${GIT_REPO_VPP=https://gerrit.fd.io/r/vpp} -export GIT_REPO_VPP -: ${GIT_REPO_LIBISCSI=https://github.com/sahlberg/libiscsi} -export GIT_REPO_LIBISCSI -: ${GIT_REPO_SPDK_NVME_CLI=https://github.com/spdk/nvme-cli} -export GIT_REPO_SPDK_NVME_CLI -: ${GIT_REPO_INTEL_IPSEC_MB=https://github.com/spdk/intel-ipsec-mb.git} -export GIT_REPO_INTEL_IPSEC_MB -: ${DRIVER_LOCATION_QAT=https://01.org/sites/default/files/downloads//qat1.7.l.4.9.0-00008.tar.gz} -export DRIVER_LOCATION_QAT -: ${GIT_REPO_GIT=https://github.com/git/git/archive/v${GIT_VERSION}.tar.gz} -export GIT_REPO_GIT - -if [ $PACKAGEMNG == 'pkg' ]; then - jobs=$(($(sysctl -n hw.ncpu) * 2)) -else - jobs=$(($(nproc) * 2)) -fi - -if $UPGRADE; then - if [ $PACKAGEMNG == 'yum' ]; then - sudo $PACKAGEMNG upgrade -y - elif [ $PACKAGEMNG == 'dnf' ]; then - sudo $PACKAGEMNG upgrade -y - elif [ $PACKAGEMNG == 'apt-get' ]; then - sudo $PACKAGEMNG update - sudo $PACKAGEMNG upgrade -y - elif [ $PACKAGEMNG == 'pacman' ]; then - sudo $PACKAGEMNG -Syu --noconfirm --needed - elif [ $PACKAGEMNG == 'pkg' ]; then - sudo $PACKAGEMNG upgrade -y - fi -fi - -if $INSTALL; then - if [ "${OSID} ${OSVERSION}" == 'centos 8' ]; then - 
#During install using vm_setup.sh there is error with AppStream, to fix it we need to refresh yum - sudo yum update -y --refresh - fi - sudo spdk_repo/spdk/scripts/pkgdep.sh --all - - if [ $PACKAGEMNG == 'pkg' ]; then - sudo pkg install -y pciutils \ - jq \ - gdb \ - fio \ - p5-extutils-pkgconfig \ - libtool \ - flex \ - bison \ - gdisk \ - socat \ - sshpass \ - py37-pandas \ - wget - - elif [ $PACKAGEMNG == 'yum' ]; then - sudo yum install -y pciutils \ - valgrind \ - jq \ - nvme-cli \ - gdb \ - fio \ - librbd-devel \ - kernel-devel \ - gflags-devel \ - libasan \ - libubsan \ - autoconf \ - automake \ - libtool \ - libmount-devel \ - iscsi-initiator-utils \ - isns-utils-devel pmempool \ - perl-open \ - glib2-devel \ - pixman-devel \ - astyle-devel \ - elfutils \ - elfutils-libelf-devel \ - flex \ - bison \ - targetcli \ - perl-Switch \ - librdmacm-utils \ - libibverbs-utils \ - gdisk \ - socat \ - sshfs \ - sshpass \ - python3-pandas \ - rpm-build \ - iptables \ - clang-analyzer \ - bc \ - kernel-modules-extra \ - systemd-devel \ - python3 \ - wget - - sudo yum install -y nbd || { - wget -O nbd.rpm https://download-ib01.fedoraproject.org/pub/epel/7/x86_64/Packages/n/nbd-3.14-2.el7.x86_64.rpm - sudo yum install -y nbd.rpm - } - - elif [ $PACKAGEMNG == 'dnf' ]; then - if echo $CONF | grep -q tsocks; then - # currently, tsocks package is retired in fedora 31, so don't exit in case - # installation failed - # FIXME: Review when fedora starts to successfully build this package again. 
- sudo dnf install -y tsocks || echo "Installation of the tsocks package failed, proxy may not be available" - fi - - sudo dnf install -y \ - valgrind \ - jq \ - nvme-cli \ - ceph \ - gdb \ - fio \ - librbd-devel \ - kernel-devel \ - gflags-devel \ - libasan \ - libubsan \ - autoconf \ - automake \ - libtool \ - libmount-devel \ - iscsi-initiator-utils \ - isns-utils-devel \ - pmempool \ - perl-open \ - glib2-devel \ - pixman-devel \ - astyle-devel \ - elfutils \ - libabigail \ - elfutils-libelf-devel \ - flex \ - bison \ - targetcli \ - perl-Switch \ - librdmacm-utils \ - libibverbs-utils \ - gdisk \ - socat \ - sshfs \ - sshpass \ - python3-pandas \ - btrfs-progs \ - rpm-build \ - iptables \ - clang-analyzer \ - bc \ - kernel-modules-extra \ - systemd-devel \ - smartmontools \ - wget - - elif [ $PACKAGEMNG == 'apt-get' ]; then - echo "Package perl-open is not available at Ubuntu repositories" >&2 - - if echo $CONF | grep -q tsocks; then - sudo apt-get install -y tsocks - fi - - # asan an ubsan have to be installed together to not mix up gcc versions - if sudo apt-get install -y libasan5; then - sudo apt-get install -y libubsan1 - else - echo "Latest libasan5 is not available" >&2 - echo " installing libasan2 and corresponding libubsan0" >&2 - sudo apt-get install -y libasan2 - sudo apt-get install -y libubsan0 - fi - if ! sudo apt-get install -y rdma-core; then - echo "Package rdma-core is avaliable at Ubuntu 18 [universe] repositorium" >&2 - sudo apt-get install -y rdmacm-utils - sudo apt-get install -y ibverbs-utils - fi - if ! sudo apt-get install -y libpmempool1; then - echo "Package libpmempool1 is available at Ubuntu 18 [universe] repositorium" >&2 - fi - if ! sudo apt-get install -y clang-tools; then - echo "Package clang-tools is available at Ubuntu 18 [universe] repositorium" >&2 - fi - if ! 
sudo apt-get install -y --no-install-suggests --no-install-recommends open-isns-utils; then - echo "Package open-isns-utils is available at Ubuntu 18 [universe] repositorium" >&2 - fi - - # Package name for Ubuntu 18 is targetcli-fb but for Ubuntu 16 it's targetcli - if ! sudo apt-get install -y targetcli-fb; then - sudo apt-get install -y targetcli - fi - - # On Ubuntu 20.04 (focal) btrfs-tools are available under different name - btrfs-progs - if ! sudo apt-get install -y btrfs-tools; then - sudo apt-get install -y btrfs-progs - fi - - sudo apt-get install -y \ - valgrind \ - jq \ - nvme-cli \ - ceph \ - gdb \ - fio \ - librbd-dev \ - linux-headers-generic \ - libgflags-dev \ - autoconf \ - automake \ - libtool \ - libmount-dev \ - open-iscsi \ - libglib2.0-dev \ - libpixman-1-dev \ - astyle \ - elfutils \ - libelf-dev \ - flex \ - bison \ - libswitch-perl \ - gdisk \ - socat \ - sshfs \ - sshpass \ - python3-pandas \ - bc \ - smartmontools \ - wget - - # rpm-build is not used - # iptables installed by default - - elif [ $PACKAGEMNG == 'pacman' ]; then - if echo $CONF | grep -q tsocks; then - sudo pacman -Sy --noconfirm --needed tsocks - fi - - sudo pacman -Sy --noconfirm --needed valgrind \ - jq \ - nvme-cli \ - ceph \ - gdb \ - fio \ - linux-headers \ - gflags \ - autoconf \ - automake \ - libtool \ - libutil-linux \ - libiscsi \ - open-isns \ - glib2 \ - pixman \ - flex \ - bison \ - elfutils \ - libelf \ - astyle \ - gptfdisk \ - socat \ - sshfs \ - sshpass \ - python-pandas \ - btrfs-progs \ - iptables \ - clang \ - bc \ - perl-switch \ - open-iscsi \ - smartmontools \ - parted \ - wget - - # TODO: - # These are either missing or require some other installation method - # than pacman: - - # librbd-devel - # perl-open - # targetcli - +if [[ -n $CONF_PATH ]]; then + if [[ -f $CONF_PATH ]]; then + source "$CONF_PATH" else - echo "Package manager is undefined, skipping INSTALL step" - fi - - if [ "${OSID} ${OSVERSION}" == 'centos 7' ]; then - install_git + error 
"Configuration file does not exist: '$CONF_PATH'" fi fi -mkdir -p spdk_repo/output || echo "Can not create spdk_repo/output directory." - -if [ -d spdk_repo/spdk ]; then - echo "spdk source already present, not cloning" -else - git -C spdk_repo clone "${GIT_REPO_SPDK}" +if $UPGRADE; then + upgrade fi -git -C spdk_repo/spdk config submodule.dpdk.url "${GIT_REPO_DPDK}" -git -C spdk_repo/spdk config submodule.intel-ipsec-mb.url "${GIT_REPO_INTEL_IPSEC_MB}" -git -C spdk_repo/spdk submodule update --init --recursive - -sudo mkdir -p /usr/src -if [ $OSID != 'freebsd' ]; then - install_libiscsi & - install_vpp & - install_nvmecli & - install_qat & - install_rocksdb & - install_flamegraph & - install_qemu & +if $INSTALL; then + sudo "$rootdir/scripts/pkgdep.sh" --all + pre_install + install "${packages[@]}" fi -install_fio & - -wait -install_reference_spdk +source "$vmsetupdir/pkgdep/git" # create autorun-spdk.conf in home folder. This is sourced by the autotest_common.sh file. # By setting any one of the values below to 0, you can skip that specific test. If you are @@ -745,7 +159,7 @@ install_reference_spdk # probably best to only run the tests that you believe your changes have modified along with # Scanbuild and check format. This is because running the whole suite of tests in series can # take ~40 minutes to complete. -if [ ! -e ~/autorun-spdk.conf ]; then +if [[ ! -e ~/autorun-spdk.conf ]]; then cat > ~/autorun-spdk.conf << EOF # assign a value of 1 to all of the pertinent tests SPDK_RUN_VALGRIND=1 @@ -771,8 +185,6 @@ SPDK_TEST_IOAT=0 # requires some extra configuration. 
see TEST_ENV_SETUP_README SPDK_TEST_VHOST=0 SPDK_TEST_VHOST_INIT=0 -# Not configured here -SPDK_RUN_INSTALLED_DPDK=0 EOF fi diff --git a/test/common/lib/test_env.c b/test/common/lib/test_env.c index 49d874dfecb..5e2912b5c71 100644 --- a/test/common/lib/test_env.c +++ b/test/common/lib/test_env.c @@ -47,6 +47,8 @@ void free_cores(void); DEFINE_STUB(spdk_process_is_primary, bool, (void), true) DEFINE_STUB(spdk_memzone_lookup, void *, (const char *name), NULL) +DEFINE_STUB_V(spdk_pci_driver_register, (const char *name, struct spdk_pci_id *id_table, + uint32_t flags)); DEFINE_STUB(spdk_pci_nvme_get_driver, struct spdk_pci_driver *, (void), NULL) DEFINE_STUB(spdk_pci_ioat_get_driver, struct spdk_pci_driver *, (void), NULL) DEFINE_STUB(spdk_pci_virtio_get_driver, struct spdk_pci_driver *, (void), NULL) diff --git a/test/common/skipped_build_files.txt b/test/common/skipped_build_files.txt index dca9676814a..d138d62a5bc 100644 --- a/test/common/skipped_build_files.txt +++ b/test/common/skipped_build_files.txt @@ -1,9 +1,6 @@ # Not configured to test vtune. 
lib/bdev/vtune -# Not configured to test VPP -module/sock/vpp/vpp - # Not configured to test rocksdb env file lib/rocksdb/env_spdk.cc diff --git a/test/common/skipped_tests.txt b/test/common/skipped_tests.txt index d96957f2b99..56b8133a498 100644 --- a/test/common/skipped_tests.txt +++ b/test/common/skipped_tests.txt @@ -9,9 +9,6 @@ ftl_restore_nv_cache # Waiting for test refactor iscsi_tgt_fio_remote_nvme -# VPP deprecated with 20.07 -iscsi_tgt_vpp - # Waiting on significant test rewrite nvme_opal nvme_opal_bdevio @@ -24,9 +21,6 @@ spdkcli_nvmf_fc unittest_nvmf_fc unittest_nvmf_fc_ls -# Enable after cuse tests switch to physical devices -nvme_ns_manage_cuse - # These tests are currently only run manually vhost_blk_fs_integrity vhost_blk_hot_remove diff --git a/test/compress/compress.sh b/test/compress/compress.sh index fea26aab86c..0c67f70212b 100755 --- a/test/compress/compress.sh +++ b/test/compress/compress.sh @@ -33,7 +33,11 @@ function create_vols() { waitforbdev lvs0/lv0 $rpc_py compress_set_pmd -p "$pmd" - $rpc_py bdev_compress_create -b lvs0/lv0 -p /tmp/pmem + if [ -z "$1" ]; then + $rpc_py bdev_compress_create -b lvs0/lv0 -p /tmp/pmem + else + $rpc_py bdev_compress_create -b lvs0/lv0 -p /tmp/pmem -l $1 + fi waitforbdev COMP_lvs0/lv0 } @@ -54,7 +58,7 @@ function run_bdevperf() { bdevperf_pid=$! 
trap 'killprocess $bdevperf_pid; error_cleanup; exit 1' SIGINT SIGTERM EXIT waitforlisten $bdevperf_pid - create_vols + create_vols $4 $rootdir/test/bdev/bdevperf/bdevperf.py perform_tests destroy_vols trap - SIGINT SIGTERM EXIT @@ -78,7 +82,10 @@ esac mkdir -p /tmp/pmem # per patch bdevperf uses slightly different params than nightly +# logical block size same as underlying device, then 512 then 4096 run_bdevperf 32 4096 3 +run_bdevperf 32 4096 3 512 +run_bdevperf 32 4096 3 4096 if [ $RUN_NIGHTLY -eq 1 ]; then run_bdevio diff --git a/test/dd/basic_rw.sh b/test/dd/basic_rw.sh new file mode 100755 index 00000000000..ff86fd84eae --- /dev/null +++ b/test/dd/basic_rw.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +testdir=$(readlink -f "$(dirname "$0")") +rootdir=$(readlink -f "$testdir/../../") +source "$testdir/common.sh" + +basic_rw() { + local native_bs=$1 + local count size + local qds bss + + qds=(1 64) + # Generate some bs for tests based on the native_bs + for bs in {0..2}; do + bss+=($((native_bs << bs))) + done + + for bs in "${bss[@]}"; do + for qd in "${qds[@]}"; do + count=$((0xffff / bs)) + count=$((count == 0 ? 
1 : count)) + size=$((count * bs)) + + gen_bytes "$size" > "$test_file0" + + "${DD_APP[@]}" \ + --if="$test_file0" \ + --ob="$bdev0" \ + --bs="$bs" \ + --qd="$qd" \ + --json <(gen_conf) + + "${DD_APP[@]}" \ + --ib="$bdev0" \ + --of="$test_file1" \ + --bs="$bs" \ + --qd="$qd" \ + --count="$count" \ + --json <(gen_conf) + + diff -q "$test_file0" "$test_file1" + clear_nvme "$bdev0" "" "$size" + done + done +} + +basic_offset() { + # Check if offseting works - using default io size of 4k + local count seek skip data data_check + + gen_bytes 4096 > "$test_file0" + ((count = seek = skip = 1)) + data=$(< "$test_file0") + + "${DD_APP[@]}" \ + --if="$test_file0" \ + --ob="$bdev0" \ + --seek="$seek" \ + --json <(gen_conf) + + "${DD_APP[@]}" \ + --ib="$bdev0" \ + --of="$test_file1" \ + --skip="$skip" \ + --count="$count" \ + --json <(gen_conf) + + read -rn${#data} data_check < "$test_file1" + [[ $data == "$data_check" ]] +} + +cleanup() { + clear_nvme "$bdev0" + rm -f "$test_file0" "$test_file1" +} + +trap "cleanup" EXIT + +nvmes=("$@") +nvme0=Nvme0 nvme0_pci=${nvmes[0]} bdev0=Nvme0n1 + +declare -A method_bdev_nvme_attach_controller_0=( + ["name"]=$nvme0 + ["traddr"]=$nvme0_pci + ["trtype"]=pcie +) + +test_file0=$SPDK_TEST_STORAGE/dd.dump0 +test_file1=$SPDK_TEST_STORAGE/dd.dump1 +native_bs=$(get_native_nvme_bs "$nvme0_pci") + +# Test if running with bs < native_bs successfully fails +run_test "dd_bs_lt_native_bs" \ + NOT "${DD_APP[@]}" \ + --if=<(:) \ + --ob="$bdev0" \ + --bs=$((native_bs >> 1)) \ + --json <(gen_conf) + +run_test "dd_rw" basic_rw "$native_bs" +run_test "dd_rw_offset" basic_offset diff --git a/test/dd/bdev_to_bdev.sh b/test/dd/bdev_to_bdev.sh new file mode 100755 index 00000000000..acb2ec4c37c --- /dev/null +++ b/test/dd/bdev_to_bdev.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +testdir=$(readlink -f "$(dirname "$0")") +rootdir=$(readlink -f "$testdir/../../") +source "$testdir/common.sh" + +nvmes=("$@") + +offset_magic() { + local magic_check + local offsets 
offset + + offsets=(16 256 1024) # * bs + + for offset in "${offsets[@]}"; do + "${DD_APP[@]}" \ + --ib="$bdev0" \ + --ob="$bdev1" \ + --count="$count" \ + --seek="$offset" \ + --bs="$bs" \ + --json <(gen_conf) + + "${DD_APP[@]}" \ + --ib="$bdev1" \ + --of="$test_file1" \ + --count=1 \ + --skip="$offset" \ + --bs="$bs" \ + --json <(gen_conf) + + read -rn${#magic} magic_check < "$test_file1" + [[ $magic_check == "$magic" ]] + done +} + +cleanup() { + # Zero up to 1G on input|output bdev + clear_nvme "$bdev0" "" $((0x40000000 + ${#magic})) + clear_nvme "$bdev1" "" $((0x40000000 + ${#magic})) + rm -f "$test_file0" "$test_file1" "$aio1" +} + +trap "cleanup" EXIT + +bs=$((1024 << 10)) + +if ((${#nvmes[@]} > 1)); then + nvme0=Nvme0 bdev0=Nvme0n1 nvme0_pci=${nvmes[0]} # input bdev + nvme1=Nvme1 bdev1=Nvme1n1 nvme1_pci=${nvmes[1]} # output bdev + + declare -A method_bdev_nvme_attach_controller_0=( + ["name"]=$nvme0 + ["traddr"]=$nvme0_pci + ["trtype"]=pcie + ) + declare -A method_bdev_nvme_attach_controller_1=( + ["name"]=$nvme1 + ["traddr"]=$nvme1_pci + ["trtype"]=pcie + ) +else + # Use AIO to compensate lack of actual hardware + nvme0=Nvme0 bdev0=Nvme0n1 nvme0_pci=${nvmes[0]} # input bdev + aio1=$SPDK_TEST_STORAGE/aio1 bdev1=aio1 # output bdev + + declare -A method_bdev_nvme_attach_controller_1=( + ["name"]=$nvme0 + ["traddr"]=$nvme0_pci + ["trtype"]=pcie + ) + declare -A method_bdev_aio_create_0=( + ["name"]=$bdev1 + ["filename"]=$aio1 + ["block_size"]=4096 + ) + + # 2G AIO file + "${DD_APP[@]}" \ + --if=/dev/zero \ + --of="$aio1" \ + --bs="$bs" \ + --count=2048 +fi + +test_file0=$SPDK_TEST_STORAGE/dd.dump0 +test_file1=$SPDK_TEST_STORAGE/dd.dump1 + +magic="This Is Our Magic, find it" +echo "$magic" > "$test_file0" + +# Make the file a bit bigger (~512MB) +run_test "dd_inflate_file" \ + "${DD_APP[@]}" \ + --if=/dev/zero \ + --of="$test_file0" \ + --oflag=append \ + --bs="$bs" \ + --count=512 + +test_file0_size=$(wc -c < "$test_file0") + +# Now, copy it over to first nvme 
with default bs (4k) +run_test "dd_copy_to_out_bdev" \ + "${DD_APP[@]}" \ + --if="$test_file0" \ + --ob="$bdev0" \ + --json <(gen_conf) + +count=$(((test_file0_size / bs) + 1)) + +run_test "dd_offset_magic" offset_magic diff --git a/test/dd/common.sh b/test/dd/common.sh new file mode 100644 index 00000000000..d2f7defa3f1 --- /dev/null +++ b/test/dd/common.sh @@ -0,0 +1,154 @@ +source "$rootdir/test/common/autotest_common.sh" +source "$rootdir/scripts/common.sh" + +clear_nvme() { + local bdev=$1 + local nvme_ref=$2 + local size=${3:-0xffff} + + local bs=$((1024 << 10)) # 1M + local count=$(((size / bs) + (size % bs ? 1 : 0))) + + "${DD_APP[@]}" \ + --if="/dev/zero" \ + --bs="$bs" \ + --ob="$bdev" \ + --count="$count" \ + --json <(gen_conf $nvme_ref) +} + +trunc_files() { + local f + for f; do : > "$f"; done +} + +gen_conf() { + xtrace_disable + + local ref_name + local method methods + local param params + local config + + # Pick references to all assoc arrays and build subsystem's config + # around them. The assoc array should be the name of the rpc method + # suffixed with unique _ID (ID may be any string). Default arrays + # should be prefixed with _method string. The keys of the array + # should store names of the method's parameters - proper quoting + # of the values is done here. extra_subsystems[] can store extra + # json configuration for different subsystems, other than bdev. + + methods=("${@:-${!method_@}}") + local IFS="," + + for ref_name in "${methods[@]}"; do + method=${ref_name#*method_} method=${method%_*} params=() + + # FIXME: centos7's Bash got trapped in 2011: + # local -n ref=$ref_name -> local: -n: invalid option + # HACK: it with eval and partial refs instead. 
+ eval "local refs=(\${!${ref_name}[@]})" + local param_ref + + for param in "${refs[@]}"; do + param_ref="${ref_name}[$param]" + if [[ ${!param_ref} =~ ^([0-9]+|true|false|\{.*\})$ ]]; then + params+=("\"$param\": ${!param_ref}") + else + params+=("\"$param\": \"${!param_ref}\"") + fi + done + + config+=("$( + cat <<- JSON + { + "params": { + ${params[*]} + }, + "method": "$method" + } + JSON + )") + done + + jq . <<- JSON | tee /dev/stderr + { + "subsystems": [ + { + "subsystem": "bdev", + "config": [ + ${config[*]} + ] + } + ${extra_subsystems[*]:+,${extra_subsystems[*]}} + ] + } + JSON + + xtrace_restore +} + +gen_bytes() { + xtrace_disable + + local max=$1 + local bytes + local byte + local string + shift + + bytes=({a..z} {0..9}) + if (($#)); then + bytes=("$@") + fi + + for ((byte = 0; byte < max; byte++)); do + string+=${bytes[RANDOM % ${#bytes[@]}]} + done + printf '%b' "$string" + + xtrace_restore +} + +get_native_nvme_bs() { + # This is now needed since spdk_dd will reject all bs smaller than the + # native bs of given nvme. We need to make sure all tests are using + # bs >= native_bs. Use identify here so we don't have to switch nvmes + # between user space and the kernel back and forth. + local pci=$1 lbaf id + + mapfile -t id < <("$rootdir/build/examples/identify" -r trtype:pcie "traddr:$pci") + + # Get size of the current LBAF + [[ ${id[*]} =~ "Current LBA Format:"\ *"LBA Format #"([0-9]+) ]] + lbaf=${BASH_REMATCH[1]} + [[ ${id[*]} =~ "LBA Format #$lbaf: Data Size:"\ *([0-9]+) ]] + lbaf=${BASH_REMATCH[1]} + + echo "$lbaf" +} + +check_liburing() { + # Simply check if spdk_dd links to liburing. If yes, log that information. + local lib so + local -g liburing_in_use=0 + + while read -r lib _ so _; do + if [[ $lib == liburing.so.* ]]; then + printf '* spdk_dd linked to liburing\n' + # For sanity, check build config to see if liburing was requested. 
+ if [[ -e $rootdir/test/common/build_config.sh ]]; then + source "$rootdir/test/common/build_config.sh" + fi + if [[ $CONFIG_URING != y ]]; then + printf '* spdk_dd built with liburing, but no liburing support requested?\n' + fi + if [[ ! -e $so ]]; then + printf '* %s is missing, aborting\n' "$lib" + return 1 + fi + export liburing_in_use=1 + return 0 + fi + done < <(LD_TRACE_LOADED_OBJECTS=1 "${DD_APP[@]}") >&2 +} diff --git a/test/dd/dd.sh b/test/dd/dd.sh new file mode 100755 index 00000000000..e2b8bb86a09 --- /dev/null +++ b/test/dd/dd.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +testdir=$(readlink -f "$(dirname "$0")") +rootdir=$(readlink -f "$testdir/../../") +source "$testdir/common.sh" + +"$rootdir/scripts/setup.sh" +nvmes=($(nvme_in_userspace)) + +check_liburing + +run_test "spdk_dd_basic_rw" "$testdir/basic_rw.sh" "${nvmes[@]}" +run_test "spdk_dd_posix" "$testdir/posix.sh" +run_test "spdk_dd_bdev_to_bdev" "$testdir/bdev_to_bdev.sh" "${nvmes[@]}" diff --git a/test/dd/posix.sh b/test/dd/posix.sh new file mode 100755 index 00000000000..15346d8d3f3 --- /dev/null +++ b/test/dd/posix.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +testdir=$(readlink -f "$(dirname "$0")") +rootdir=$(readlink -f "$testdir/../../") +source "$testdir/common.sh" + +cleanup() { + rm -f "$test_file0"{,.link} + rm -f "$test_file1"{,.link} +} + +append() { + local dump0 + local dump1 + + dump0=$(gen_bytes 32) + dump1=$(gen_bytes 32) + + printf '%s' "$dump0" > "$test_file0" + printf '%s' "$dump1" > "$test_file1" + + "${DD_APP[@]}" --if="$test_file0" --of="$test_file1" --oflag=append + + [[ $(< "$test_file1") == "${dump1}${dump0}" ]] +} + +directory() { + NOT "${DD_APP[@]}" --if="$test_file0" --iflag=directory --of="$test_file0" + NOT "${DD_APP[@]}" --if="$test_file0" --of="$test_file0" --oflag=directory +} + +nofollow() { + local test_file0_link=$test_file0.link + local test_file1_link=$test_file1.link + + ln -fs "$test_file0" "$test_file0_link" + ln -fs "$test_file1" "$test_file1_link" + + NOT 
"${DD_APP[@]}" --if="$test_file0_link" --iflag=nofollow --of="$test_file1" + NOT "${DD_APP[@]}" --if="$test_file0" --of="$test_file1_link" --oflag=nofollow + + # Do an extra step of checking if we actually can follow symlinks + gen_bytes 512 > "$test_file0" + + "${DD_APP[@]}" --if="$test_file0_link" --of="$test_file1" + [[ $(< "$test_file0") == "$(< "$test_file1")" ]] +} + +noatime() { + local atime_if + local atime_of + + # It seems like spdk_dd doesn't update the atime in case 0 bytes are copied. + # This differs from how standard dd works for instance + gen_bytes 512 > "$test_file0" + + atime_if=$(stat --printf="%X" "$test_file0") + atime_of=$(stat --printf="%X" "$test_file1") + + "${DD_APP[@]}" --if="$test_file0" --iflag=noatime --of="$test_file1" + ((atime_if == $(stat --printf="%X" "$test_file0"))) + ((atime_of == $(stat --printf="%X" "$test_file1"))) + + "${DD_APP[@]}" --if="$test_file0" --of="$test_file1" + ((atime_if < $(stat --printf="%X" "$test_file0"))) +} + +io() { + local flags_ro flags_rw flag_ro flag_rw + + # O_NONBLOCK is actually a no-op, from a functional perspective, while + # open()ing a regular file, but let's keep it just to test its usage. + flags_ro=(direct nonblock) + flags_rw=("${flags_ro[@]}" sync dsync) + + # simply check if data was correctly copied between files + for flag_ro in "${flags_ro[@]}"; do + gen_bytes 512 > "$test_file0" + for flag_rw in "${flags_rw[@]}"; do + "${DD_APP[@]}" \ + --if="$test_file0" \ + --iflag="$flag_ro" \ + --of="$test_file1" \ + --oflag="$flag_rw" + [[ $(< "$test_file0") == "$(< "$test_file1")" ]] + done + done +} + +tests() { + printf '* First test run%s\n' \ + "${msg[liburing_in_use]}" >&2 + + run_test "dd_flag_append" append + run_test "dd_flag_directory" directory + run_test "dd_flag_nofollow" nofollow + run_test "dd_flag_noatime" noatime + run_test "dd_flags_misc" io +} + +tests_forced_aio() { + printf '* Second test run%s\n' \ + "${msg[liburing_in_use ? 
2 : 0]}" >&2 + + DD_APP+=("--aio") + run_test "dd_flag_append_forced_aio" append + run_test "dd_flag_directory_forced_aio" directory + run_test "dd_flag_nofollow_forced_aio" nofollow + run_test "dd_flag_noatime_forced_aio" noatime + run_test "dd_flags_misc_forced_aio" io +} + +msg[0]=", using AIO" +msg[1]=", liburing in use" +msg[2]=", disabling liburing, forcing AIO" + +trap "cleanup" EXIT + +test_file0=$SPDK_TEST_STORAGE/dd.dump0 +test_file1=$SPDK_TEST_STORAGE/dd.dump1 + +tests +tests_forced_aio diff --git a/test/env/mem_callbacks/mem_callbacks.c b/test/env/mem_callbacks/mem_callbacks.c index accdedbcd1b..165ddb3d8e3 100644 --- a/test/env/mem_callbacks/mem_callbacks.c +++ b/test/env/mem_callbacks/mem_callbacks.c @@ -43,16 +43,6 @@ #include #include -#if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0) - -static void -test(void) -{ - printf("DPDK version %s does not support memory callbacks\n", rte_version()); -} - -#else - struct mem_allocation { uintptr_t vaddr; size_t len; @@ -183,8 +173,6 @@ test(void) rte_free(buf4); } -#endif - int main(int argc, char **argv) { diff --git a/test/env/pci/pci_ut.c b/test/env/pci/pci_ut.c index 8287e78ac54..66d36b98063 100644 --- a/test/env/pci/pci_ut.c +++ b/test/env/pci/pci_ut.c @@ -59,9 +59,7 @@ pci_claim_test(struct spdk_pci_device *dev) } } -static struct spdk_pci_driver ut_pci_driver = { - .is_registered = true, -}; +static struct spdk_pci_driver ut_pci_driver; struct ut_pci_dev { struct spdk_pci_device pci; @@ -128,14 +126,6 @@ ut_enum_cb(void *ctx, struct spdk_pci_device *dev) return 0; } -static void -ut_detach(struct spdk_pci_device *dev) -{ - struct ut_pci_dev *ut_dev = (struct ut_pci_dev *)dev; - - ut_dev->attached = false; -} - static void pci_hook_test(void) { @@ -145,6 +135,7 @@ pci_hook_test(void) uint64_t bar0_paddr, bar0_size; int rc; + ut_dev.pci.type = "custom"; ut_dev.pci.id.vendor_id = 0x4; ut_dev.pci.id.device_id = 0x8; @@ -159,7 +150,6 @@ pci_hook_test(void) ut_dev.pci.unmap_bar = ut_unmap_bar; 
ut_dev.pci.cfg_read = ut_cfg_read; ut_dev.pci.cfg_write = ut_cfg_write; - ut_dev.pci.detach = ut_detach; /* hook the device into the PCI layer */ spdk_pci_hook_device(&ut_pci_driver, &ut_dev.pci); @@ -207,9 +197,7 @@ pci_hook_test(void) /* test spdk_pci_device_claim() */ pci_claim_test(&ut_dev.pci); - /* detach and verify our callback was called */ spdk_pci_device_detach(&ut_dev.pci); - CU_ASSERT(!ut_dev.attached); CU_ASSERT(!ut_dev.pci.internal.attached); /* unhook the device */ diff --git a/test/env/vtophys/Makefile b/test/env/vtophys/Makefile index 68c4632a3aa..4b881e0123d 100644 --- a/test/env/vtophys/Makefile +++ b/test/env/vtophys/Makefile @@ -32,8 +32,13 @@ # SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk -UNIT_TEST_LINK_ENV = 1 -TEST_FILE = vtophys.c +APP = vtophys +C_SRCS := vtophys.c +SPDK_LIB_LIST += thread util log -include $(SPDK_ROOT_DIR)/mk/spdk.unittest.mk +SYS_LIBS += -lcunit + +include $(SPDK_ROOT_DIR)/mk/spdk.app.mk diff --git a/test/event/Makefile b/test/event/Makefile index b3e9cf1b05e..4b9cab8677b 100644 --- a/test/event/Makefile +++ b/test/event/Makefile @@ -36,6 +36,10 @@ include $(SPDK_ROOT_DIR)/mk/spdk.common.mk DIRS-y = event_perf reactor reactor_perf +ifeq ($(OS),Linux) +DIRS-y += app_repeat +endif + .PHONY: all clean $(DIRS-y) all: $(DIRS-y) diff --git a/test/event/app_repeat/.gitignore b/test/event/app_repeat/.gitignore new file mode 100644 index 00000000000..0e59ff47d7f --- /dev/null +++ b/test/event/app_repeat/.gitignore @@ -0,0 +1 @@ +app_repeat diff --git a/test/event/app_repeat/Makefile b/test/event/app_repeat/Makefile new file mode 100644 index 00000000000..eb5140b1cff --- /dev/null +++ b/test/event/app_repeat/Makefile @@ -0,0 +1,54 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk + +APP = app_repeat +C_SRCS := app_repeat.c + +# Some of the modules and libaries are not repeatable yet, only organize +# the repeatable ones. 
+SPDK_LIB_LIST = event_bdev event_accel event_vmd event_sock +SPDK_LIB_LIST += event log trace conf thread util bdev accel rpc jsonrpc json sock vmd +SPDK_LIB_LIST += app_rpc log_rpc bdev_rpc notify +SPDK_LIB_LIST += event_nbd nbd + +BLOCKDEV_LIST = bdev_malloc bdev_null +BLOCKDEV_LIST += bdev_aio +SYS_LIBS += -laio + +SPDK_LIB_LIST += $(BLOCKDEV_LIST) + +include $(SPDK_ROOT_DIR)/mk/spdk.app.mk diff --git a/test/event/app_repeat/app_repeat.c b/test/event/app_repeat/app_repeat.c new file mode 100644 index 00000000000..d83e7949cd1 --- /dev/null +++ b/test/event/app_repeat/app_repeat.c @@ -0,0 +1,115 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/event.h" +#include "spdk/string.h" +#include "spdk/thread.h" + +struct spdk_app_opts g_opts = {}; +static const char g_app_repeat_get_opts_string[] = "t:"; +static int g_repeat_times = 2; +static bool g_exit; + +static void +app_repeat_usage(void) +{ + printf(" -t number of times to repeat calling spdk_app_start/stop\n"); +} + +static int +app_repeat_parse_arg(int ch, char *arg) +{ + switch (ch) { + case 't': + g_repeat_times = spdk_strtol(arg, 0); + if (g_repeat_times < 2) { + return -EINVAL; + } + break; + default: + return -EINVAL; + } + return 0; +} + +static void +app_repeat_started(void *arg1) +{ + int index = *(int *)arg1; + + printf("spdk_app_start is called in Round %d.\n", index); +} + +static void _app_repeat_shutdown_cb(void) +{ + printf("Shutdown signal received, exit.\n"); + g_exit = true; + spdk_app_stop(0); +} + +static void _app_repeat_usr1_handler(int signal) +{ + printf("USR1 signal received, restart spdk application framework.\n"); + spdk_app_stop(0); +} + +int +main(int argc, char **argv) +{ + int rc; + int i; + + spdk_app_opts_init(&g_opts); + g_opts.name = "app_repeat"; + g_opts.shutdown_cb = _app_repeat_shutdown_cb; + g_opts.usr1_handler = _app_repeat_usr1_handler; + if ((rc = spdk_app_parse_args(argc, argv, &g_opts, g_app_repeat_get_opts_string, + NULL, app_repeat_parse_arg, app_repeat_usage)) != + SPDK_APP_PARSE_ARGS_SUCCESS) { + return rc; + } + + for (i = 
0; i < g_repeat_times; i++) { + rc = spdk_app_start(&g_opts, app_repeat_started, &i); + spdk_app_fini(); + + if (rc) { + fprintf(stderr, "Failed to call spdk_app_start in Round %d.\n", i); + break; + } + } + + return rc; +} diff --git a/test/event/event.sh b/test/event/event.sh index fe023edb7a9..d198cd116ff 100755 --- a/test/event/event.sh +++ b/test/event/event.sh @@ -3,7 +3,42 @@ testdir=$(readlink -f $(dirname $0)) rootdir=$(readlink -f $testdir/../..) source $rootdir/test/common/autotest_common.sh +source $rootdir/test/bdev/nbd_common.sh + +function app_repeat_test() { + local rpc_server=/var/tmp/spdk-nbd.sock + local nbd_list=("/dev/nbd0" "/dev/nbd1") + local bdev_list=("Malloc0" "Malloc1") + local repeat_times=4 + + modprobe nbd + $rootdir/test/event/app_repeat/app_repeat -r $rpc_server -m 0x3 -t $repeat_times & + repeat_pid=$! + trap 'killprocess $repeat_pid; exit 1' SIGINT SIGTERM EXIT + echo "Process app_repeat pid: $repeat_pid" + + for i in {0..2}; do + echo "spdk_app_start Round $i" + waitforlisten $repeat_pid $rpc_server + + $rootdir/scripts/rpc.py -s $rpc_server bdev_malloc_create 64 4096 + $rootdir/scripts/rpc.py -s $rpc_server bdev_malloc_create 64 4096 + + nbd_rpc_data_verify $rpc_server "${bdev_list[*]}" "${nbd_list[*]}" + ./scripts/rpc.py -s $rpc_server spdk_kill_instance SIGUSR1 + done + + waitforlisten $repeat_pid $rpc_server + killprocess $repeat_pid + trap - SIGINT SIGTERM EXIT + + return 0 +} run_test "event_perf" $testdir/event_perf/event_perf -m 0xF -t 1 run_test "event_reactor" $testdir/reactor/reactor -t 1 run_test "event_reactor_perf" $testdir/reactor_perf/reactor_perf -t 1 + +if [ $(uname -s) = Linux ] && modprobe -n nbd; then + run_test "app_repeat" app_repeat_test +fi diff --git a/test/external_code/hello_world/Makefile b/test/external_code/hello_world/Makefile index 224d5cc4d3a..9f6c9cf30a5 100644 --- a/test/external_code/hello_world/Makefile +++ b/test/external_code/hello_world/Makefile @@ -31,36 +31,43 @@ # OF THIS SOFTWARE, EVEN 
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# Shows how to compile both an external bdev and an external application against the SPDK combined shared object and dpdk shared object. +# Shows how to compile both an external bdev and an external application against the SPDK combined shared object and dpdk shared objects. bdev_shared_combo: - $(CC) $(COMMON_CFLAGS) -L../passthru -o hello_bdev ./hello_bdev.c -lpassthru_external -lspdk -lspdk_env_dpdk -ldpdk + $(CC) $(COMMON_CFLAGS) -L../passthru -Wl,-rpath=$(SPDK_LIB_DIR),--no-as-needed -o hello_bdev ./hello_bdev.c -lpassthru_external \ + -lspdk -lspdk_env_dpdk -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf -lrte_mempool_ring -lrte_pci -lrte_bus_pci -lrte_kvargs \ + -lrte_vhost -lrte_net -lrte_hash -lrte_cryptodev -Wl,--no-whole-archive -# Shows how to compile both an external bdev and an external application against the SPDK individual shared objects and dpdk shared object. +# Shows how to compile both an external bdev and an external application against the SPDK individual shared objects and dpdk shared objects. bdev_shared_iso: $(CC) $(COMMON_CFLAGS) -L../passthru -Wl,-rpath=$(SPDK_LIB_DIR),--no-as-needed -o hello_bdev ./hello_bdev.c \ - -lpassthru_external -lspdk_event_bdev -lspdk_bdev -lspdk_bdev_malloc -lspdk_log -lspdk_thread -lspdk_util -lspdk_event -lspdk_env_dpdk -ldpdk + -lpassthru_external -lspdk_event_bdev -lspdk_event_accel -lspdk_event_vmd -lspdk_bdev -lspdk_bdev_malloc -lspdk_log -lspdk_thread -lspdk_util -lspdk_event \ + -lspdk_env_dpdk -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf -lrte_mempool_ring -lrte_pci -lrte_bus_pci -lrte_kvargs \ + -lrte_vhost -lrte_net -lrte_hash -lrte_cryptodev -Wl,--no-whole-archive -lnuma -# Shows how to compile an external application against the SPDK combined shared object and dpdk shared object. +# Shows how to compile an external application against the SPDK combined shared object and dpdk shared objects. 
alone_shared_combo: - $(CC) $(COMMON_CFLAGS) -o hello_bdev ./hello_bdev.c -lspdk -lspdk_env_dpdk -ldpdk + $(CC) $(COMMON_CFLAGS) -Wl,-rpath=$(SPDK_LIB_DIR),--no-as-needed -o hello_bdev ./hello_bdev.c -lspdk -lspdk_env_dpdk -lrte_eal \ + -lrte_mempool -lrte_ring -lrte_mbuf -lrte_mempool_ring -lrte_pci -lrte_bus_pci -lrte_kvargs -lrte_vhost -lrte_net -lrte_hash -lrte_cryptodev -# Shows how to compile an external application against the SPDK individual shared objects and dpdk shared object. +# Shows how to compile an external application against the SPDK individual shared objects and dpdk shared objects. alone_shared_iso: $(CC) $(COMMON_CFLAGS) -Wl,-rpath=$(SPDK_LIB_DIR),--no-as-needed -o hello_bdev ./hello_bdev.c -lspdk_event_bdev \ - -lspdk_bdev -lspdk_bdev_malloc -lspdk_log -lspdk_thread -lspdk_util -lspdk_event -lspdk_env_dpdk -ldpdk + -lspdk_event_accel -lspdk_event_vmd -lspdk_bdev -lspdk_bdev_malloc -lspdk_log -lspdk_thread -lspdk_util -lspdk_event \ + -lspdk_env_dpdk -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf -lrte_mempool_ring -lrte_pci -lrte_bus_pci -lrte_kvargs \ + -lrte_vhost -lrte_net -lrte_hash -lrte_cryptodev # Shows how to compile an external application against the SPDK archives. 
alone_static: - $(CC) $(COMMON_CFLAGS) -o hello_bdev ./hello_bdev.c -Wl,--whole-archive -lspdk_bdev_malloc -lspdk_event_bdev -lspdk_event_accel -lspdk_event_vmd \ - -lspdk_bdev -lspdk_accel -lspdk_event -lspdk_thread -lspdk_util -lspdk_conf -lspdk_trace -lspdk_log -lspdk_json \ + $(CC) $(COMMON_CFLAGS) -o hello_bdev ./hello_bdev.c -Wl,--whole-archive,-Bstatic -lspdk_bdev_malloc -lspdk_event_bdev -lspdk_event_accel -lspdk_event_vmd \ + -lspdk_event_sock -lspdk_bdev -lspdk_accel -lspdk_event -lspdk_thread -lspdk_util -lspdk_conf -lspdk_trace -lspdk_log -lspdk_json \ -lspdk_jsonrpc -lspdk_rpc -lspdk_sock -lspdk_notify -lspdk_vmd -lspdk_env_dpdk -lrte_eal -lrte_mempool -lrte_ring \ - -lrte_mbuf -lrte_mempool_ring -lrte_pci -lrte_bus_pci -lrte_kvargs -lrte_vhost -lrte_net -lrte_hash \ - -lrte_cryptodev -Wl,--no-whole-archive -lnuma -luuid -lpthread -ldl -lrt + -lrte_mbuf -lrte_mempool_ring -lrte_pci -lrte_bus_pci -lrte_kvargs -lrte_vhost -lrte_net -lrte_hash -lrte_telemetry \ + -lrte_cryptodev -Wl,--no-whole-archive,-Bdynamic -lnuma -luuid -lpthread -ldl -lrt # Shows how to compile and external bdev and application sgainst the SPDK archives. 
bdev_static: - $(CC) $(COMMON_CFLAGS) -L../passthru -o hello_bdev ./hello_bdev.c -Wl,--whole-archive -lpassthru_external -lspdk_bdev_malloc -lspdk_event_bdev \ - -lspdk_event_accel -lspdk_event_vmd -lspdk_bdev -lspdk_accel -lspdk_event -lspdk_thread -lspdk_util -lspdk_conf -lspdk_trace \ + $(CC) $(COMMON_CFLAGS) -L../passthru -o hello_bdev ./hello_bdev.c -Wl,--whole-archive,-Bstatic -lpassthru_external -lspdk_bdev_malloc -lspdk_event_bdev \ + -lspdk_event_accel -lspdk_event_vmd -lspdk_event_sock -lspdk_bdev -lspdk_accel -lspdk_event -lspdk_thread -lspdk_util -lspdk_conf -lspdk_trace \ -lspdk_log -lspdk_json -lspdk_jsonrpc -lspdk_rpc -lspdk_sock -lspdk_notify -lspdk_vmd -lspdk_env_dpdk -lrte_eal -lrte_mempool \ - -lrte_ring -lrte_mbuf -lrte_mempool_ring -lrte_pci -lrte_bus_pci -lrte_kvargs -lrte_vhost -lrte_net -lrte_hash -lrte_cryptodev \ - -Wl,--no-whole-archive -lnuma -luuid -lpthread -ldl -lrt + -lrte_ring -lrte_mbuf -lrte_mempool_ring -lrte_pci -lrte_bus_pci -lrte_kvargs -lrte_vhost -lrte_net -lrte_hash -lrte_telemetry -lrte_cryptodev \ + -Wl,--no-whole-archive,-Bdynamic -lnuma -luuid -lpthread -ldl -lrt diff --git a/test/external_code/test_make.sh b/test/external_code/test_make.sh index fdf291de19d..c5ca70c51a4 100755 --- a/test/external_code/test_make.sh +++ b/test/external_code/test_make.sh @@ -16,33 +16,30 @@ make -C $SPDK_DIR -j$(nproc) export SPDK_HEADER_DIR="$SPDK_DIR/include" export SPDK_LIB_DIR="$SPDK_DIR/build/lib" -export DPDK_LIB_DIR="$SPDK_DIR/dpdk/build/lib" +export DPDK_LIB_DIR="${SPDK_RUN_EXTERNAL_DPDK:-$SPDK_DIR/dpdk/build}/lib" +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$SPDK_LIB_DIR:$DPDK_LIB_DIR:"$test_root/passthru" # The default target is to make both the app and bdev and link them against the combined SPDK shared library libspdk.so. 
run_test "external_make_tc1" make -C $test_root hello_world_bdev_shared_combo - -LD_LIBRARY_PATH=$SPDK_LIB_DIR:$DPDK_LIB_DIR:"$test_root/passthru" run_test "external_run_tc1" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev_external.conf -b TestPT +run_test "external_run_tc1" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev_external.conf -b TestPT make -C $test_root clean # Make just the application linked against the combined SPDK shared library libspdk.so. run_test "external_make_tc2" make -C $test_root hello_world_no_bdev_shared_combo - -LD_LIBRARY_PATH=$SPDK_LIB_DIR:$DPDK_LIB_DIR run_test "external_run_tc2" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev.conf -b Malloc0 +run_test "external_run_tc2" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev.conf -b Malloc0 make -C $test_root clean # Make both the application and bdev against individual SPDK shared libraries. run_test "external_make_tc3" make -C $test_root hello_world_bdev_shared_iso - -LD_LIBRARY_PATH=$SPDK_LIB_DIR:$DPDK_LIB_DIR:"$test_root/passthru" run_test "external_run_tc3" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev_external.conf -b TestPT +run_test "external_run_tc3" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev_external.conf -b TestPT make -C $test_root clean # Make just the application linked against individual SPDK shared libraries. run_test "external_make_tc4" make -C $test_root hello_world_no_bdev_shared_iso - -LD_LIBRARY_PATH=$SPDK_LIB_DIR:$DPDK_LIB_DIR run_test "external_run_tc4" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev.conf -b Malloc0 +run_test "external_run_tc4" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev.conf -b Malloc0 make -C $test_root clean @@ -52,15 +49,13 @@ make -C $SPDK_DIR -j$(nproc) # Make both the application and bdev against individual SPDK archives. 
run_test "external_make_tc5" make -C $test_root hello_world_bdev_static - -LD_LIBRARY_PATH=$SPDK_LIB_DIR:$DPDK_LIB_DIR:"$test_root/passthru" run_test "external_run_tc5" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev_external.conf -b TestPT +run_test "external_run_tc5" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev_external.conf -b TestPT make -C $test_root clean # Make just the application linked against individual SPDK archives. run_test "external_make_tc6" make -C $test_root hello_world_no_bdev_static - -LD_LIBRARY_PATH=$SPDK_LIB_DIR:$DPDK_LIB_DIR run_test "external_run_tc6" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev.conf -b Malloc0 +run_test "external_run_tc6" $test_root/hello_world/hello_bdev --json $test_root/hello_world/bdev.conf -b Malloc0 make -C $test_root clean make -C $SPDK_DIR -j$(nproc) clean diff --git a/test/ftl/ftl.sh b/test/ftl/ftl.sh index e0c08bbebe4..b432bdfb0b7 100755 --- a/test/ftl/ftl.sh +++ b/test/ftl/ftl.sh @@ -9,7 +9,7 @@ rpc_py=$rootdir/scripts/rpc.py function at_ftl_exit() { # restore original driver - PCI_WHITELIST="$device" PCI_BLACKLIST="" DRIVER_OVERRIDE="$ocssd_original_dirver" ./scripts/setup.sh + PCI_WHITELIST="$device" PCI_BLACKLIST="" DRIVER_OVERRIDE="$ocssd_original_dirver" $rootdir/scripts/setup.sh } read -r device _ <<< "$OCSSD_PCI_DEVICES" @@ -27,7 +27,7 @@ ocssd_original_dirver="$(basename $(readlink /sys/bus/pci/devices/$device/driver trap 'at_ftl_exit' SIGINT SIGTERM EXIT # OCSSD is blacklisted so bind it to vfio/uio driver before testing -PCI_WHITELIST="$device" PCI_BLACKLIST="" DRIVER_OVERRIDE="" ./scripts/setup.sh +PCI_WHITELIST="$device" PCI_BLACKLIST="" DRIVER_OVERRIDE="" $rootdir/scripts/setup.sh # Use first regular NVMe disk (non-OC) as non-volatile cache nvme_disks=$($rootdir/scripts/gen_nvme.sh --json | jq -r \ diff --git a/test/fuzz/autofuzz_iscsi.sh b/test/fuzz/autofuzz_iscsi.sh index 8793e8bf1d0..9748e07ec4a 100755 --- 
a/test/fuzz/autofuzz_iscsi.sh +++ b/test/fuzz/autofuzz_iscsi.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit if [ -z "$TARGET_IP" ]; then echo "TARGET_IP not defined in environment" @@ -57,7 +55,7 @@ $rpc_py bdev_malloc_create $MALLOC_BDEV_SIZE $MALLOC_BLOCK_SIZE $rpc_py iscsi_create_target_node disk1 disk1_alias 'Malloc0:0' $PORTAL_TAG:$INITIATOR_TAG 256 -d sleep 1 -trap 'killprocess $iscsipid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $iscsipid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT $rootdir/test/app/fuzz/iscsi_fuzz/iscsi_fuzz -m 0xF0 -T $TARGET_IP -t $TEST_TIMEOUT 2> $output_dir/iscsi_autofuzz_logs.txt @@ -70,6 +68,6 @@ trap - SIGINT SIGTERM EXIT killprocess $iscsipid -iscsitestfini $1 $2 +iscsitestfini timing_exit iscsi_fuzz_test diff --git a/test/fuzz/autofuzz_vhost.sh b/test/fuzz/autofuzz_vhost.sh index 4b040ba82f5..94def21eff9 100755 --- a/test/fuzz/autofuzz_vhost.sh +++ b/test/fuzz/autofuzz_vhost.sh @@ -8,7 +8,7 @@ TEST_TIMEOUT=1200 VHOST_APP+=(-p 0) FUZZ_RPC_SOCK="/var/tmp/spdk_fuzz.sock" -VHOST_FUZZ_APP+=(-r "$FUZZ_RPC_SOCK" --wait-for-rpc) +VHOST_FUZZ_APP+=(-r "$FUZZ_RPC_SOCK" --wait-for-rpc -g) vhost_rpc_py="$rootdir/scripts/rpc.py" fuzz_generic_rpc_py="$rootdir/scripts/rpc.py -s $FUZZ_RPC_SOCK" diff --git a/test/iscsi_tgt/bdev_io_wait/bdev_io_wait.sh b/test/iscsi_tgt/bdev_io_wait/bdev_io_wait.sh index 15dfe1165e9..9e78e259ebd 100755 --- a/test/iscsi_tgt/bdev_io_wait/bdev_io_wait.sh +++ b/test/iscsi_tgt/bdev_io_wait/bdev_io_wait.sh @@ -5,7 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) 
source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -iscsitestinit $1 $2 +iscsitestinit MALLOC_BDEV_SIZE=64 MALLOC_BLOCK_SIZE=512 @@ -17,7 +17,7 @@ timing_enter start_iscsi_tgt "${ISCSI_APP[@]}" -m 0x2 -p 1 -s 512 --wait-for-rpc & pid=$! echo "iSCSI target launched. pid: $pid" -trap 'killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $pid $rpc_py iscsi_set_options -o 30 -a 4 # Minimal number of bdev io pool (5) and cache (1) @@ -36,7 +36,7 @@ $rpc_py bdev_malloc_create $MALLOC_BDEV_SIZE $MALLOC_BLOCK_SIZE # "-d" ==> disable CHAP authentication $rpc_py iscsi_create_target_node disk1 disk1_alias 'Malloc0:0' $PORTAL_TAG:$INITIATOR_TAG 256 -d sleep 1 -trap 'killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT "$rootdir/test/bdev/bdevperf/bdevperf" --json <(initiator_json_config) -q 128 -o 4096 -w write -t 1 "$rootdir/test/bdev/bdevperf/bdevperf" --json <(initiator_json_config) -q 128 -o 4096 -w read -t 1 @@ -47,4 +47,4 @@ trap - SIGINT SIGTERM EXIT killprocess $pid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/common.sh b/test/iscsi_tgt/common.sh index d42a2a3a2fc..7233b099dcc 100644 --- a/test/iscsi_tgt/common.sh +++ b/test/iscsi_tgt/common.sh @@ -1,11 +1,19 @@ # Network configuration -TARGET_INTERFACE="spdk_tgt_int" +# There is one initiator interface and it is accessed directly. +# There are two target interfaces and they are accessed through a namespace.
+ISCSI_BRIDGE="iscsi_br" INITIATOR_INTERFACE="spdk_init_int" +INITIATOR_BRIDGE="init_br" TARGET_NAMESPACE="spdk_iscsi_ns" TARGET_NS_CMD=(ip netns exec "$TARGET_NAMESPACE") +TARGET_INTERFACE="spdk_tgt_int" +TARGET_INTERFACE2="spdk_tgt_int2" +TARGET_BRIDGE="tgt_br" +TARGET_BRIDGE2="tgt_br2" # iSCSI target configuration TARGET_IP=10.0.0.1 +TARGET_IP2=10.0.0.3 INITIATOR_IP=10.0.0.2 ISCSI_PORT=3260 NETMASK=$INITIATOR_IP/32 @@ -13,65 +21,88 @@ INITIATOR_TAG=2 INITIATOR_NAME=ANY PORTAL_TAG=1 ISCSI_APP=("${TARGET_NS_CMD[@]}" "${ISCSI_APP[@]}") -if [ $SPDK_TEST_VPP -eq 1 ]; then - ISCSI_APP+=(-L sock_vpp) -fi ISCSI_TEST_CORE_MASK=0xFF function create_veth_interfaces() { - # $1 = test type (posix/vpp) - ip netns del $TARGET_NAMESPACE || true + ip link set $INITIATOR_BRIDGE nomaster || true + ip link set $TARGET_BRIDGE nomaster || true + ip link set $TARGET_BRIDGE2 nomaster || true + ip link set $INITIATOR_BRIDGE down || true + ip link set $TARGET_BRIDGE down || true + ip link set $TARGET_BRIDGE2 down || true + ip link delete $ISCSI_BRIDGE type bridge || true ip link delete $INITIATOR_INTERFACE || true + "${TARGET_NS_CMD[@]}" ip link delete $TARGET_INTERFACE || true + "${TARGET_NS_CMD[@]}" ip link delete $TARGET_INTERFACE2 || true + ip netns del $TARGET_NAMESPACE || true - trap 'cleanup_veth_interfaces $1; exit 1' SIGINT SIGTERM EXIT + trap 'cleanup_veth_interfaces; exit 1' SIGINT SIGTERM EXIT - # Create veth (Virtual ethernet) interface pair - ip link add $INITIATOR_INTERFACE type veth peer name $TARGET_INTERFACE - ip addr add $INITIATOR_IP/24 dev $INITIATOR_INTERFACE - ip link set $INITIATOR_INTERFACE up - - # Create and add interface for target to network namespace + # Create network namespace ip netns add $TARGET_NAMESPACE + + # Create veth (Virtual ethernet) interface pairs + ip link add $INITIATOR_INTERFACE type veth peer name $INITIATOR_BRIDGE + ip link add $TARGET_INTERFACE type veth peer name $TARGET_BRIDGE + ip link add $TARGET_INTERFACE2 type veth peer name 
$TARGET_BRIDGE2 + + # Associate veth interface pairs with network namespace ip link set $TARGET_INTERFACE netns $TARGET_NAMESPACE + ip link set $TARGET_INTERFACE2 netns $TARGET_NAMESPACE - # Accept connections from veth interface - iptables -I INPUT 1 -i $INITIATOR_INTERFACE -p tcp --dport $ISCSI_PORT -j ACCEPT + # Allocate IP addresses + ip addr add $INITIATOR_IP/24 dev $INITIATOR_INTERFACE + "${TARGET_NS_CMD[@]}" ip addr add $TARGET_IP/24 dev $TARGET_INTERFACE + "${TARGET_NS_CMD[@]}" ip addr add $TARGET_IP2/24 dev $TARGET_INTERFACE2 + # Link up veth interfaces + ip link set $INITIATOR_INTERFACE up + ip link set $INITIATOR_BRIDGE up + ip link set $TARGET_BRIDGE up + ip link set $TARGET_BRIDGE2 up "${TARGET_NS_CMD[@]}" ip link set $TARGET_INTERFACE up + "${TARGET_NS_CMD[@]}" ip link set $TARGET_INTERFACE2 up + "${TARGET_NS_CMD[@]}" ip link set lo up - if [ "$1" == "posix" ]; then - "${TARGET_NS_CMD[@]}" ip link set lo up - "${TARGET_NS_CMD[@]}" ip addr add $TARGET_IP/24 dev $TARGET_INTERFACE + # Create a bridge + ip link add $ISCSI_BRIDGE type bridge + ip link set $ISCSI_BRIDGE up - # Verify connectivity - ping -c 1 $TARGET_IP - ip netns exec $TARGET_NAMESPACE ping -c 1 $INITIATOR_IP - else - start_vpp - fi + # Add veth interfaces to the bridge + ip link set $INITIATOR_BRIDGE master $ISCSI_BRIDGE + ip link set $TARGET_BRIDGE master $ISCSI_BRIDGE + ip link set $TARGET_BRIDGE2 master $ISCSI_BRIDGE + + # Accept connections from veth interface + iptables -I INPUT 1 -i $INITIATOR_INTERFACE -p tcp --dport $ISCSI_PORT -j ACCEPT + + # Verify connectivity + ping -c 1 $TARGET_IP + ping -c 1 $TARGET_IP2 + "${TARGET_NS_CMD[@]}" ping -c 1 $INITIATOR_IP + "${TARGET_NS_CMD[@]}" ping -c 1 $INITIATOR_IP } function cleanup_veth_interfaces() { - # $1 = test type (posix/vpp) - if [ "$1" == "vpp" ]; then - kill_vpp - fi - - # Cleanup veth interfaces and network namespace + # Cleanup bridge, veth interfaces, and network namespace # Note: removing one veth, removes the pair + ip link set 
$INITIATOR_BRIDGE nomaster + ip link set $TARGET_BRIDGE nomaster + ip link set $TARGET_BRIDGE2 nomaster + ip link set $INITIATOR_BRIDGE down + ip link set $TARGET_BRIDGE down + ip link set $TARGET_BRIDGE2 down + ip link delete $ISCSI_BRIDGE type bridge ip link delete $INITIATOR_INTERFACE + "${TARGET_NS_CMD[@]}" ip link delete $TARGET_INTERFACE + "${TARGET_NS_CMD[@]}" ip link delete $TARGET_INTERFACE2 ip netns del $TARGET_NAMESPACE } function iscsitestinit() { - if [ "$1" == "iso" ]; then + if [ "$TEST_MODE" == "iso" ]; then $rootdir/scripts/setup.sh - if [ -n "$2" ]; then - create_veth_interfaces $2 - else - # default to posix - create_veth_interfaces "posix" - fi + create_veth_interfaces fi } @@ -91,100 +122,12 @@ function waitforiscsidevices() { } function iscsitestfini() { - if [ "$1" == "iso" ]; then - if [ -n "$2" ]; then - cleanup_veth_interfaces $2 - else - # default to posix - cleanup_veth_interfaces "posix" - fi + if [ "$TEST_MODE" == "iso" ]; then + cleanup_veth_interfaces $rootdir/scripts/setup.sh reset fi } -function start_vpp() { - # We need to make sure that posix side doesn't send jumbo packets while - # for VPP side maximal size of MTU for TCP is 1460 and tests doesn't work - # stable with larger packets - MTU=1460 - MTU_W_HEADER=$((MTU + 20)) - ip link set dev $INITIATOR_INTERFACE mtu $MTU - ethtool -K $INITIATOR_INTERFACE tso off - ethtool -k $INITIATOR_INTERFACE - - # Start VPP process in SPDK target network namespace - "${TARGET_NS_CMD[@]}" vpp \ - unix { nodaemon cli-listen /run/vpp/cli.sock } \ - dpdk { no-pci } \ - session { evt_qs_memfd_seg } \ - socksvr { socket-name /run/vpp-api.sock } \ - plugins { \ - plugin default { disable } \ - plugin dpdk_plugin.so { enable } \ - } & - - vpp_pid=$! 
- echo "VPP Process pid: $vpp_pid" - - gdb_attach $vpp_pid & - - # Wait until VPP starts responding - xtrace_disable - counter=40 - while [ $counter -gt 0 ]; do - vppctl show version | grep -E "vpp v[0-9]+\.[0-9]+" && break - counter=$((counter - 1)) - sleep 0.5 - done - xtrace_restore - if [ $counter -eq 0 ]; then - return 1 - fi - - # Below VPP commands are masked with "|| true" for the sake of - # running the test in the CI system. For reasons unknown when - # run via CI these commands result in 141 return code (pipefail) - # even despite producing valid output. - # Using "|| true" does not impact the "-e" flag used in test scripts - # because vppctl cli commands always return with 0, even if - # there was an error. - # As a result - grep checks on command outputs must be used to - # verify vpp configuration and connectivity. - - # Setup host interface - vppctl create host-interface name $TARGET_INTERFACE || true - VPP_TGT_INT="host-$TARGET_INTERFACE" - vppctl set interface state $VPP_TGT_INT up || true - vppctl set interface ip address $VPP_TGT_INT $TARGET_IP/24 || true - vppctl set interface mtu $MTU $VPP_TGT_INT || true - - vppctl show interface | tr -s " " | grep -E "host-$TARGET_INTERFACE [0-9]+ up $MTU/0/0/0" - - # Disable session layer - # NOTE: VPP net framework should enable it itself. 
- vppctl session disable || true - - # Verify connectivity - vppctl show int addr | grep -E "$TARGET_IP/24" - ip addr show $INITIATOR_INTERFACE - ip netns exec $TARGET_NAMESPACE ip addr show $TARGET_INTERFACE - sleep 3 - # SC1010: ping -M do - in this case do is an option not bash special word - # shellcheck disable=SC1010 - ping -c 1 $TARGET_IP -s $((MTU - 28)) -M do - vppctl ping $INITIATOR_IP repeat 1 size $((MTU - (28 + 8))) verbose | grep -E "$MTU_W_HEADER bytes from $INITIATOR_IP" -} - -function kill_vpp() { - vppctl delete host-interface name $TARGET_INTERFACE || true - - # Dump VPP configuration before kill - vppctl show api clients || true - vppctl show session || true - vppctl show errors || true - - killprocess $vpp_pid -} function initiator_json_config() { # Prepare config file for iSCSI initiator jq . <<- JSON diff --git a/test/iscsi_tgt/digests/digests.sh b/test/iscsi_tgt/digests/digests.sh index 3a03c10ecbc..0d46c5dbbbf 100755 --- a/test/iscsi_tgt/digests/digests.sh +++ b/test/iscsi_tgt/digests/digests.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit function node_login_fio_logout() { for arg in "$@"; do @@ -57,7 +55,7 @@ timing_enter start_iscsi_tgt pid=$! 
echo "Process pid: $pid" -trap 'killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $pid $rpc_py iscsi_set_options -o 30 -a 16 @@ -91,4 +89,4 @@ trap - SIGINT SIGTERM EXIT iscsicleanup killprocess $pid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/ext4test/ext4test.sh b/test/iscsi_tgt/ext4test/ext4test.sh index 8de41736787..600517a43c1 100755 --- a/test/iscsi_tgt/ext4test/ext4test.sh +++ b/test/iscsi_tgt/ext4test/ext4test.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit rpc_py="$rootdir/scripts/rpc.py" node_base="iqn.2013-06.com.intel.ch.spdk" @@ -18,7 +16,7 @@ timing_enter start_iscsi_tgt pid=$! echo "Process pid: $pid" -trap '$rpc_py bdev_split_delete Name0n1 || true; killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap '$rpc_py bdev_split_delete Name0n1 || true; killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $pid $rpc_py iscsi_set_options -o 30 -a 4 -b $node_base @@ -43,7 +41,7 @@ iscsiadm -m node --login -p $TARGET_IP:$ISCSI_PORT waitforiscsidevices 1 trap 'for new_dir in $(dir -d /mnt/*dir); do umount $new_dir; rm -rf $new_dir; done; - iscsicleanup; killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT + iscsicleanup; killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT echo "Test error injection" $rpc_py bdev_error_inject_error EE_Malloc0 'all' 'failure' -n 1000 @@ -128,4 +126,4 @@ if [ -z "$NO_NVME" ]; then fi killprocess $pid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/filesystem/filesystem.sh b/test/iscsi_tgt/filesystem/filesystem.sh index 156b5bde36b..c72c67f105f 100755 --- 
a/test/iscsi_tgt/filesystem/filesystem.sh +++ b/test/iscsi_tgt/filesystem/filesystem.sh @@ -6,9 +6,7 @@ source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh source $rootdir/scripts/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit rpc_py="$rootdir/scripts/rpc.py" # Remove lvol bdevs and stores. @@ -31,7 +29,7 @@ timing_enter start_iscsi_tgt pid=$! echo "Process pid: $pid" -trap 'killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $pid $rpc_py iscsi_set_options -o 30 -a 16 @@ -64,7 +62,7 @@ iscsiadm -m discovery -t sendtargets -p $TARGET_IP:$ISCSI_PORT iscsiadm -m node --login -p $TARGET_IP:$ISCSI_PORT waitforiscsidevices 1 -trap 'iscsicleanup; remove_backends; umount /mnt/device; rm -rf /mnt/device; killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'iscsicleanup; remove_backends; umount /mnt/device; rm -rf /mnt/device; killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT mkdir -p /mnt/device @@ -142,4 +140,4 @@ trap - SIGINT SIGTERM EXIT iscsicleanup remove_backends killprocess $pid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/fio/fio.sh b/test/iscsi_tgt/fio/fio.sh index ae3a2f30880..dc072620ff1 100755 --- a/test/iscsi_tgt/fio/fio.sh +++ b/test/iscsi_tgt/fio/fio.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. 
-iscsitestinit $1 $2 +iscsitestinit delete_tmp_files() { rm -f $testdir/iscsi2.json @@ -94,7 +92,7 @@ iscsiadm -m discovery -t sendtargets -p $TARGET_IP:$ISCSI_PORT iscsiadm -m node --login -p $TARGET_IP:$ISCSI_PORT waitforiscsidevices 2 -trap 'iscsicleanup; killprocess $pid; iscsitestfini $1 $2; delete_tmp_files; exit 1' SIGINT SIGTERM EXIT +trap 'iscsicleanup; killprocess $pid; iscsitestfini; delete_tmp_files; exit 1' SIGINT SIGTERM EXIT $fio_py -p iscsi -i 4096 -d 1 -t randrw -r 1 -v $fio_py -p iscsi -i 131072 -d 32 -t randrw -r 1 -v @@ -147,4 +145,4 @@ trap - SIGINT SIGTERM EXIT killprocess $pid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/fuzz/fuzz.sh b/test/iscsi_tgt/fuzz/fuzz.sh index bc290fa8f70..03237a909fb 100755 --- a/test/iscsi_tgt/fuzz/fuzz.sh +++ b/test/iscsi_tgt/fuzz/fuzz.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. 
-iscsitestinit $1 $2 +iscsitestinit if [ -z "$TARGET_IP" ]; then echo "TARGET_IP not defined in environment" @@ -47,7 +45,7 @@ $rpc_py bdev_malloc_create $MALLOC_BDEV_SIZE $MALLOC_BLOCK_SIZE $rpc_py iscsi_create_target_node disk1 disk1_alias 'Malloc0:0' $PORTAL_TAG:$INITIATOR_TAG 256 -d sleep 1 -trap 'killprocess $iscsipid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $iscsipid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT $rootdir/test/app/fuzz/iscsi_fuzz/iscsi_fuzz -m 0xF0 -T $TARGET_IP -t 30 2> $output_dir/iscsi_autofuzz_logs.txt @@ -60,6 +58,6 @@ trap - SIGINT SIGTERM EXIT killprocess $iscsipid -iscsitestfini $1 $2 +iscsitestfini timing_exit iscsi_fuzz diff --git a/test/iscsi_tgt/initiator/initiator.sh b/test/iscsi_tgt/initiator/initiator.sh index 5da1f320be5..90a37c98d8e 100755 --- a/test/iscsi_tgt/initiator/initiator.sh +++ b/test/iscsi_tgt/initiator/initiator.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. 
-iscsitestinit $1 $2 +iscsitestinit MALLOC_BDEV_SIZE=64 MALLOC_BLOCK_SIZE=512 @@ -36,7 +34,7 @@ $rpc_py bdev_malloc_create $MALLOC_BDEV_SIZE $MALLOC_BLOCK_SIZE # "-d" ==> disable CHAP authentication $rpc_py iscsi_create_target_node disk1 disk1_alias 'Malloc0:0' $PORTAL_TAG:$INITIATOR_TAG 256 -d sleep 1 -trap 'killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT "$rootdir/test/bdev/bdevperf/bdevperf" --json <(initiator_json_config) -q 128 -o 4096 -w verify -t 5 -s 512 if [ $RUN_NIGHTLY -eq 1 ]; then @@ -49,4 +47,4 @@ trap - SIGINT SIGTERM EXIT killprocess $pid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/ip_migration/ip_migration.sh b/test/iscsi_tgt/ip_migration/ip_migration.sh index d737e01b359..80e93a6984a 100755 --- a/test/iscsi_tgt/ip_migration/ip_migration.sh +++ b/test/iscsi_tgt/ip_migration/ip_migration.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. 
-iscsitestinit $1 $2 +iscsitestinit rpc_py="$rootdir/scripts/rpc.py" fio_py="$rootdir/scripts/fio.py" @@ -97,7 +95,7 @@ for ((i = 0; i < 2; i++)); do timing_exit start_iscsi_tgt_$i rpc_config $rpc_addr $NETMASK - trap 'kill_all_iscsi_target; iscsitestfini $1 $2; exit 1' \ + trap 'kill_all_iscsi_target; iscsitestfini; exit 1' \ SIGINT SIGTERM EXIT done @@ -128,4 +126,4 @@ trap - SIGINT SIGTERM EXIT iscsicleanup $rpc_py -s $rpc_second_addr spdk_kill_instance SIGTERM -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/iscsi_tgt.sh b/test/iscsi_tgt/iscsi_tgt.sh index 0316229b60d..e3ee13254c4 100755 --- a/test/iscsi_tgt/iscsi_tgt.sh +++ b/test/iscsi_tgt/iscsi_tgt.sh @@ -9,47 +9,30 @@ fi source $rootdir/test/iscsi_tgt/common.sh -# $1 = test type (posix/vpp) -if [ "$1" == "posix" ] || [ "$1" == "vpp" ]; then - TEST_TYPE=$1 -else - echo "No iSCSI test type specified" - exit 1 -fi - # Run cleanup once to make sure we remove any stale iscsiadm # entries if they were missed in previous runs iscsicleanup # Network configuration -create_veth_interfaces $TEST_TYPE +create_veth_interfaces -trap 'cleanup_veth_interfaces $TEST_TYPE; exit 1' SIGINT SIGTERM EXIT +trap 'cleanup_veth_interfaces; exit 1' SIGINT SIGTERM EXIT -run_test "iscsi_tgt_sock" ./test/iscsi_tgt/sock/sock.sh $TEST_TYPE -if [ "$TEST_TYPE" == "posix" ]; then - # calsoft doesn't handle TCP stream properly and fails decoding iSCSI - # requests when are divided by TCP segmentation. This is very common - # situation for VPP and causes that calsoft.sh never PASS. - if [[ -d /usr/local/calsoft ]]; then - run_test "iscsi_tgt_calsoft" ./test/iscsi_tgt/calsoft/calsoft.sh - else - skip_run_test_with_warning "WARNING: Calsoft binaries not found, skipping test!" - fi +run_test "iscsi_tgt_sock" ./test/iscsi_tgt/sock/sock.sh +if [[ -d /usr/local/calsoft ]]; then + run_test "iscsi_tgt_calsoft" ./test/iscsi_tgt/calsoft/calsoft.sh +else + skip_run_test_with_warning "WARNING: Calsoft binaries not found, skipping test!" 
fi run_test "iscsi_tgt_filesystem" ./test/iscsi_tgt/filesystem/filesystem.sh run_test "iscsi_tgt_reset" ./test/iscsi_tgt/reset/reset.sh -run_test "iscsi_tgt_rpc_config" ./test/iscsi_tgt/rpc_config/rpc_config.sh $TEST_TYPE +run_test "iscsi_tgt_rpc_config" ./test/iscsi_tgt/rpc_config/rpc_config.sh run_test "iscsi_tgt_iscsi_lvol" ./test/iscsi_tgt/lvol/iscsi_lvol.sh run_test "iscsi_tgt_fio" ./test/iscsi_tgt/fio/fio.sh run_test "iscsi_tgt_qos" ./test/iscsi_tgt/qos/qos.sh - -# IP Migration tests do not support network namespaces, -# they can only be run on posix sockets. -if [ "$TEST_TYPE" == "posix" ]; then - run_test "iscsi_tgt_ip_migration" ./test/iscsi_tgt/ip_migration/ip_migration.sh -fi +run_test "iscsi_tgt_ip_migration" ./test/iscsi_tgt/ip_migration/ip_migration.sh run_test "iscsi_tgt_trace_record" ./test/iscsi_tgt/trace_record/trace_record.sh +run_test "iscsi_tgt_login_redirection" ./test/iscsi_tgt/login_redirection/login_redirection.sh if [ $RUN_NIGHTLY -eq 1 ]; then if [ $SPDK_TEST_PMDK -eq 1 ]; then @@ -59,32 +42,22 @@ if [ $RUN_NIGHTLY -eq 1 ]; then run_test "iscsi_tgt_digests" ./test/iscsi_tgt/digests/digests.sh fi if [ $SPDK_TEST_RBD -eq 1 ]; then - # RBD tests do not support network namespaces, - # they can only be run on posix sockets. - if [ "$TEST_TYPE" == "posix" ]; then - if ! hash ceph; then - echo "ERROR: SPDK_TEST_RBD requested but no ceph installed!" - false - fi - run_test "iscsi_tgt_rbd" ./test/iscsi_tgt/rbd/rbd.sh + if ! hash ceph; then + echo "ERROR: SPDK_TEST_RBD requested but no ceph installed!" + false fi + run_test "iscsi_tgt_rbd" ./test/iscsi_tgt/rbd/rbd.sh fi -trap 'cleanup_veth_interfaces $TEST_TYPE; exit 1' SIGINT SIGTERM EXIT +trap 'cleanup_veth_interfaces; exit 1' SIGINT SIGTERM EXIT if [ $SPDK_TEST_NVMF -eq 1 ]; then - # NVMe-oF tests do not support network namespaces, - # they can only be run on posix sockets. 
- if [ "$TEST_TYPE" == "posix" ]; then - # Test configure remote NVMe device from rpc and conf file - run_test "iscsi_tgt_fio_remote_nvme" ./test/iscsi_tgt/nvme_remote/fio_remote_nvme.sh - fi + # Test configure remote NVMe device from rpc and conf file + run_test "iscsi_tgt_fio_remote_nvme" ./test/iscsi_tgt/nvme_remote/fio_remote_nvme.sh fi if [ $RUN_NIGHTLY -eq 1 ]; then - if [ "$TEST_TYPE" == "posix" ]; then - run_test "iscsi_tgt_fuzz" ./test/iscsi_tgt/fuzz/fuzz.sh - fi + run_test "iscsi_tgt_fuzz" ./test/iscsi_tgt/fuzz/fuzz.sh run_test "iscsi_tgt_multiconnection" ./test/iscsi_tgt/multiconnection/multiconnection.sh fi @@ -93,5 +66,5 @@ if [ $SPDK_TEST_ISCSI_INITIATOR -eq 1 ]; then run_test "iscsi_tgt_bdev_io_wait" ./test/iscsi_tgt/bdev_io_wait/bdev_io_wait.sh fi -cleanup_veth_interfaces $TEST_TYPE +cleanup_veth_interfaces trap - SIGINT SIGTERM EXIT diff --git a/test/iscsi_tgt/login_redirection/login_redirection.sh b/test/iscsi_tgt/login_redirection/login_redirection.sh new file mode 100755 index 00000000000..824eb2e64b4 --- /dev/null +++ b/test/iscsi_tgt/login_redirection/login_redirection.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash + +testdir=$(readlink -f $(dirname $0)) +rootdir=$(readlink -f $testdir/../../..) +source $rootdir/test/common/autotest_common.sh +source $rootdir/test/iscsi_tgt/common.sh + +iscsitestinit + +NULL_BDEV_SIZE=64 +NULL_BLOCK_SIZE=512 + +rpc_py=$rootdir/scripts/rpc.py +fio_py=$rootdir/scripts/fio.py + +rpc_addr1="/var/tmp/spdk0.sock" +rpc_addr2="/var/tmp/spdk1.sock" + +# This test case uses two iSCSI target applications. + +timing_enter start_iscsi_tgts + +"${ISCSI_APP[@]}" -r $rpc_addr1 -i 0 -m 0x1 --wait-for-rpc & +pid1=$! +echo "Process pid: $pid1" + +"${ISCSI_APP[@]}" -r $rpc_addr2 -i 1 -m 0x2 --wait-for-rpc & +pid2=$! 
+echo "Process pid: $pid2" + +trap 'killprocess $pid1; killprocess $pid2; iscsitestfini; exit 1' SIGINT SIGTERM EXIT + +waitforlisten $pid1 $rpc_addr1 +$rpc_py -s $rpc_addr1 iscsi_set_options -w 0 -o 30 -a 16 +$rpc_py -s $rpc_addr1 framework_start_init +echo "iscsi_tgt_1 is listening." + +waitforlisten $pid2 $rpc_addr2 +$rpc_py -s $rpc_addr2 iscsi_set_options -w 0 -o 30 -a 16 +$rpc_py -s $rpc_addr2 framework_start_init +echo "iscsi_tgt_2 is listening." + +timing_exit start_iscsi_tgts + +# iSCSI target application 1: +# - Portal group 1 which is public and has a portal +# - Null bdev "Null0" whose size is 64MB and block length is 512. +# - Target node "iqn.2016-06.io.spdk:Target1" which has portal group 1 and Null0. +$rpc_py -s $rpc_addr1 iscsi_create_initiator_group $INITIATOR_TAG $INITIATOR_NAME $NETMASK +$rpc_py -s $rpc_addr1 iscsi_create_portal_group $PORTAL_TAG $TARGET_IP:$ISCSI_PORT +$rpc_py -s $rpc_addr1 bdev_null_create Null0 $NULL_BDEV_SIZE $NULL_BLOCK_SIZE +$rpc_py -s $rpc_addr1 iscsi_create_target_node Target1 Target1_alias 'Null0:0' "$PORTAL_TAG:$INITIATOR_TAG" 64 -d + +# iSCSI target application 2: +# - Portal group 1 which is private and has a portal +# - A null bdev Null0 whose size is 64MB and block length is 512. +# - Target node "iqn.2016-06.io.spdk:Target1" which has portal group 1 and Null0. +$rpc_py -s $rpc_addr2 iscsi_create_initiator_group $INITIATOR_TAG $INITIATOR_NAME $NETMASK +$rpc_py -s $rpc_addr2 iscsi_create_portal_group $PORTAL_TAG $TARGET_IP2:$ISCSI_PORT -p +$rpc_py -s $rpc_addr2 bdev_null_create Null0 $NULL_BDEV_SIZE $NULL_BLOCK_SIZE +$rpc_py -s $rpc_addr2 iscsi_create_target_node Target1 Target1_alias 'Null0:0' "$PORTAL_TAG:$INITIATOR_TAG" 64 -d + +iscsiadm -m discovery -t sendtargets -p $TARGET_IP:$ISCSI_PORT +iscsiadm -m node --login -p $TARGET_IP:$ISCSI_PORT +waitforiscsidevices 1 + +$fio_py -p iscsi -i 512 -d 1 -t randrw -r 15 & +fiopid=$! 
+echo "FIO pid: $fiopid" + +trap 'iscsicleanup; killprocess $pid1; killprocess $pid2; killprocess $fiopid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT + +[ "$($rpc_py -s $rpc_addr1 iscsi_get_connections | jq 'length')" = "1" ] +[ "$($rpc_py -s $rpc_addr2 iscsi_get_connections | jq 'length')" = "0" ] + +# Move among two portals by login redirection while FIO runs. + +$rpc_py -s $rpc_addr1 iscsi_target_node_set_redirect 'iqn.2016-06.io.spdk:Target1' $PORTAL_TAG -a $TARGET_IP2 -p $ISCSI_PORT +$rpc_py -s $rpc_addr1 iscsi_target_node_request_logout 'iqn.2016-06.io.spdk:Target1' -t $PORTAL_TAG + +sleep 5 + +[ "$($rpc_py -s $rpc_addr1 iscsi_get_connections | jq 'length')" = "0" ] +[ "$($rpc_py -s $rpc_addr2 iscsi_get_connections | jq 'length')" = "1" ] + +$rpc_py -s $rpc_addr1 iscsi_target_node_set_redirect 'iqn.2016-06.io.spdk:Target1' $PORTAL_TAG +$rpc_py -s $rpc_addr2 iscsi_target_node_request_logout 'iqn.2016-06.io.spdk:Target1' -t $PORTAL_TAG + +sleep 5 + +[ "$($rpc_py -s $rpc_addr1 iscsi_get_connections | jq 'length')" = "1" ] +[ "$($rpc_py -s $rpc_addr2 iscsi_get_connections | jq 'length')" = "0" ] + +wait $fiopid + +trap - SIGINT SIGTERM EXIT + +iscsicleanup +killprocess $pid1 +killprocess $pid2 +iscsitestfini diff --git a/test/iscsi_tgt/lvol/iscsi_lvol.sh b/test/iscsi_tgt/lvol/iscsi_lvol.sh index e55899d56de..ad975c63660 100755 --- a/test/iscsi_tgt/lvol/iscsi_lvol.sh +++ b/test/iscsi_tgt/lvol/iscsi_lvol.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit MALLOC_BDEV_SIZE=128 MALLOC_BLOCK_SIZE=512 @@ -28,7 +26,7 @@ timing_enter start_iscsi_tgt pid=$! 
echo "Process pid: $pid" -trap 'iscsicleanup; killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'iscsicleanup; killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $pid $rpc_py iscsi_set_options -o 30 -a 16 @@ -82,4 +80,4 @@ trap - SIGINT SIGTERM EXIT rm -f ./local-job* iscsicleanup killprocess $pid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/multiconnection/multiconnection.sh b/test/iscsi_tgt/multiconnection/multiconnection.sh index badf701971d..886ca74e8fb 100755 --- a/test/iscsi_tgt/multiconnection/multiconnection.sh +++ b/test/iscsi_tgt/multiconnection/multiconnection.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit rpc_py="$rootdir/scripts/rpc.py" fio_py="$rootdir/scripts/fio.py" @@ -38,7 +36,7 @@ timing_enter start_iscsi_tgt "${ISCSI_APP[@]}" --wait-for-rpc & iscsipid=$! echo "iSCSI target launched. pid: $iscsipid" -trap 'remove_backends; iscsicleanup; killprocess $iscsipid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'remove_backends; iscsicleanup; killprocess $iscsipid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $iscsipid $rpc_py iscsi_set_options -o 30 -a 128 @@ -81,4 +79,4 @@ rm -f ./local-job* iscsicleanup remove_backends killprocess $iscsipid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/nvme_remote/fio_remote_nvme.sh b/test/iscsi_tgt/nvme_remote/fio_remote_nvme.sh index 38329dc434d..65a2a168106 100755 --- a/test/iscsi_tgt/nvme_remote/fio_remote_nvme.sh +++ b/test/iscsi_tgt/nvme_remote/fio_remote_nvme.sh @@ -7,9 +7,7 @@ source $rootdir/test/nvmf/common.sh source $rootdir/test/iscsi_tgt/common.sh nvmftestinit -# $1 = "iso" - triggers isolation mode (setting up required environment). 
-# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit rpc_py="$rootdir/scripts/rpc.py" fio_py="$rootdir/scripts/fio.py" @@ -26,7 +24,7 @@ function run_nvme_remote() { "${ISCSI_APP[@]}" -r "$iscsi_rpc_addr" -m 0x1 -p 0 -s 512 --wait-for-rpc & iscsipid=$! echo "iSCSI target launched. pid: $iscsipid" - trap 'killprocess $iscsipid; iscsitestfini $1 $2; nvmftestfini; exit 1' SIGINT SIGTERM EXIT + trap 'killprocess $iscsipid; iscsitestfini; nvmftestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $iscsipid "$iscsi_rpc_addr" $rpc_py -s "$iscsi_rpc_addr" iscsi_set_options -o 30 -a 16 $rpc_py -s "$iscsi_rpc_addr" framework_start_init @@ -56,7 +54,7 @@ function run_nvme_remote() { "${NVMF_APP[@]}" -m 0x2 -p 1 -s 512 --wait-for-rpc & nvmfpid=$! echo "NVMf target launched. pid: $nvmfpid" -trap 'iscsitestfini $1 $2; nvmftestfini; exit 1' SIGINT SIGTERM EXIT +trap 'iscsitestfini; nvmftestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $nvmfpid $rpc_py framework_start_init $rpc_py nvmf_create_transport -t RDMA -u 8192 @@ -74,7 +72,7 @@ timing_enter start_iscsi_tgt run_nvme_remote "local" trap 'iscsicleanup; killprocess $iscsipid; - rm -f ./local-job0-0-verify.state; iscsitestfini $1 $2; nvmftestfini; exit 1' SIGINT SIGTERM EXIT + rm -f ./local-job0-0-verify.state; iscsitestfini; nvmftestfini; exit 1' SIGINT SIGTERM EXIT echo "Running FIO" $fio_py -p iscsi -i 4096 -d 1 -t randrw -r 1 -v @@ -95,5 +93,5 @@ iscsicleanup killprocess $iscsipid $rpc_py nvmf_delete_subsystem nqn.2016-06.io.spdk:cnode1 -iscsitestfini $1 $2 +iscsitestfini nvmftestfini diff --git a/test/iscsi_tgt/qos/qos.sh b/test/iscsi_tgt/qos/qos.sh index 0a8015e18a3..6690c1549d7 100755 --- a/test/iscsi_tgt/qos/qos.sh +++ b/test/iscsi_tgt/qos/qos.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). 
-# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit function run_fio() { local bdev_name=$1 @@ -62,7 +60,7 @@ timing_enter start_iscsi_tgt "${ISCSI_APP[@]}" & pid=$! echo "Process pid: $pid" -trap 'killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $pid echo "iscsi_tgt is listening. Running tests..." @@ -81,7 +79,7 @@ sleep 1 iscsiadm -m discovery -t sendtargets -p $TARGET_IP:$ISCSI_PORT iscsiadm -m node --login -p $TARGET_IP:$ISCSI_PORT -trap 'iscsicleanup; killprocess $pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'iscsicleanup; killprocess $pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT # Run FIO without any QOS limits to determine the raw performance run_fio Malloc0 @@ -142,4 +140,4 @@ rm -f ./local-job0-0-verify.state trap - SIGINT SIGTERM EXIT killprocess $pid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/rbd/rbd.sh b/test/iscsi_tgt/rbd/rbd.sh index 060cc7af0d6..7ab0e0352ee 100755 --- a/test/iscsi_tgt/rbd/rbd.sh +++ b/test/iscsi_tgt/rbd/rbd.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit timing_enter rbd_setup rbd_setup $TARGET_IP $TARGET_NAMESPACE @@ -22,7 +20,7 @@ timing_enter start_iscsi_tgt "${ISCSI_APP[@]}" -m $ISCSI_TEST_CORE_MASK --wait-for-rpc & pid=$! 
-trap 'killprocess $pid; rbd_cleanup; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $pid; rbd_cleanup; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $pid $rpc_py iscsi_set_options -o 30 -a 16 @@ -69,4 +67,4 @@ $rpc_py bdev_rbd_delete $rbd_bdev killprocess $pid rbd_cleanup -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/reset/reset.sh b/test/iscsi_tgt/reset/reset.sh index 406a10c45eb..7b1d8ada7f1 100755 --- a/test/iscsi_tgt/reset/reset.sh +++ b/test/iscsi_tgt/reset/reset.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit MALLOC_BDEV_SIZE=64 MALLOC_BLOCK_SIZE=512 @@ -54,7 +52,7 @@ $fio_py -p iscsi -i 512 -d 1 -t read -r 60 & fiopid=$! echo "FIO pid: $fiopid" -trap 'iscsicleanup; killprocess $pid; killprocess $fiopid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'iscsicleanup; killprocess $pid; killprocess $fiopid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT # Do 3 resets while making sure iscsi_tgt and fio are still running for i in 1 2 3; do @@ -74,4 +72,4 @@ trap - SIGINT SIGTERM EXIT iscsicleanup killprocess $pid -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/iscsi_tgt/rpc_config/rpc_config.py b/test/iscsi_tgt/rpc_config/rpc_config.py index 324dacbbe63..4f214eb2cd3 100755 --- a/test/iscsi_tgt/rpc_config/rpc_config.py +++ b/test/iscsi_tgt/rpc_config/rpc_config.py @@ -10,13 +10,12 @@ import random from subprocess import check_call, call, check_output, Popen, PIPE, CalledProcessError -if (len(sys.argv) == 8): +if (len(sys.argv) == 7): target_ip = sys.argv[2] initiator_ip = sys.argv[3] port = sys.argv[4] netmask = sys.argv[5] namespace = sys.argv[6] - test_type = sys.argv[7] ns_cmd = 'ip netns exec ' + namespace other_ip = '127.0.0.6' @@ -116,8 
+115,6 @@ def verify_iscsi_connection_rpc_methods(rpc_py): jsonvalues = json.loads(output) verify(jsonvalues[0]['target_node_name'] == rpc_param['target_name'], 1, "target node name vaule is {}, expected {}".format(jsonvalues[0]['target_node_name'], rpc_param['target_name'])) - verify(jsonvalues[0]['id'] == 0, 1, - "device id value is {}, expected 0".format(jsonvalues[0]['id'])) verify(jsonvalues[0]['initiator_addr'] == rpc_param['initiator_ip'], 1, "initiator address values is {}, expected {}".format(jsonvalues[0]['initiator_addr'], rpc_param['initiator_ip'])) verify(jsonvalues[0]['target_addr'] == rpc_param['target_ip'], 1, @@ -457,28 +454,6 @@ def verify_net_interface_add_delete_ip_address(rpc_py): print("verify_net_interface_add_delete_ip_address passed.") -def verify_add_nvme_bdev_rpc_methods(rpc_py): - rpc = spdk_rpc(rpc_py) - test_pass = 0 - output = check_output(["lspci", "-mm", "-nn"]) - addrs = re.findall(r'^([0-9]{2}:[0-9]{2}.[0-9]) "Non-Volatile memory controller \[0108\]".*-p02', output.decode(), re.MULTILINE) - for addr in addrs: - ctrlr_address = "-b Nvme{} -t pcie -a 0000:{}".format(addrs.index(addr), addr) - rpc.bdev_nvme_attach_controller(ctrlr_address) - print("add nvme device passed first time") - test_pass = 0 - try: - rpc.bdev_nvme_attach_controller(ctrlr_address) - except Exception as e: - print("add nvme device passed second time") - test_pass = 1 - pass - else: - pass - verify(test_pass == 1, 1, "add nvme device passed second time") - print("verify_add_nvme_bdev_rpc_methods passed.") - - if __name__ == "__main__": rpc_py = sys.argv[1] @@ -486,17 +461,13 @@ def verify_add_nvme_bdev_rpc_methods(rpc_py): try: verify_log_flag_rpc_methods(rpc_py, rpc_param) verify_net_get_interfaces(rpc_py) - # Add/delete IP will not be supported in VPP. - # It has separate vppctl utility for that. 
- if test_type == 'posix': - verify_net_interface_add_delete_ip_address(rpc_py) + verify_net_interface_add_delete_ip_address(rpc_py) create_malloc_bdevs_rpc_methods(rpc_py, rpc_param) verify_portal_groups_rpc_methods(rpc_py, rpc_param) verify_initiator_groups_rpc_methods(rpc_py, rpc_param) verify_target_nodes_rpc_methods(rpc_py, rpc_param) verify_scsi_devices_rpc_methods(rpc_py) verify_iscsi_connection_rpc_methods(rpc_py) - verify_add_nvme_bdev_rpc_methods(rpc_py) except RpcException as e: print("{}. Exiting with status {}".format(e.message, e.retval)) raise e diff --git a/test/iscsi_tgt/rpc_config/rpc_config.sh b/test/iscsi_tgt/rpc_config/rpc_config.sh index ce54b4ab2f7..1d27c7867e1 100755 --- a/test/iscsi_tgt/rpc_config/rpc_config.sh +++ b/test/iscsi_tgt/rpc_config/rpc_config.sh @@ -5,16 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = test type posix or vpp. -# $2 = "iso" - triggers isolation mode (setting up required environment). -iscsitestinit $2 $1 - -if [ "$1" == "posix" ] || [ "$1" == "vpp" ]; then - TEST_TYPE=$1 -else - echo "No iSCSI test type specified" - exit 1 -fi +iscsitestinit MALLOC_BDEV_SIZE=64 @@ -51,7 +42,7 @@ sleep 1 timing_exit start_iscsi_tgt -$rpc_config_py $rpc_py $TARGET_IP $INITIATOR_IP $ISCSI_PORT $NETMASK $TARGET_NAMESPACE $TEST_TYPE +$rpc_config_py $rpc_py $TARGET_IP $INITIATOR_IP $ISCSI_PORT $NETMASK $TARGET_NAMESPACE $rpc_py bdev_get_bdevs @@ -60,4 +51,4 @@ trap - SIGINT SIGTERM EXIT iscsicleanup killprocess $pid -iscsitestfini $2 $1 +iscsitestfini diff --git a/test/iscsi_tgt/sock/sock.sh b/test/iscsi_tgt/sock/sock.sh index 14615d3bccb..74ea4c51ae3 100755 --- a/test/iscsi_tgt/sock/sock.sh +++ b/test/iscsi_tgt/sock/sock.sh @@ -62,29 +62,9 @@ function waitfortcp() { return $ret } -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. 
-iscsitestinit $1 $2 - -if [ "$1" == "iso" ]; then - TEST_TYPE=$2 -else - TEST_TYPE=$1 -fi - -if [ -z "$TEST_TYPE" ]; then - TEST_TYPE="posix" -fi - -if [ "$TEST_TYPE" != "posix" ] && [ "$TEST_TYPE" != "vpp" ]; then - echo "No correct sock implmentation specified" - exit 1 -fi +iscsitestinit HELLO_SOCK_APP="${TARGET_NS_CMD[*]} $SPDK_EXAMPLE_DIR/hello_sock" -if [ $SPDK_TEST_VPP -eq 1 ]; then - HELLO_SOCK_APP+=" -L sock_vpp" -fi SOCAT_APP="socat" # ---------------- @@ -96,13 +76,13 @@ echo "Testing client path" # start echo server using socat $SOCAT_APP tcp-l:$ISCSI_PORT,fork,bind=$INITIATOR_IP exec:'/bin/cat' & server_pid=$! -trap 'killprocess $server_pid;iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $server_pid;iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitfortcp $server_pid $INITIATOR_IP:$ISCSI_PORT # send message using hello_sock client message="**MESSAGE:This is a test message from the client**" -response=$(echo $message | $HELLO_SOCK_APP -H $INITIATOR_IP -P $ISCSI_PORT -N $TEST_TYPE) +response=$(echo $message | $HELLO_SOCK_APP -H $INITIATOR_IP -P $ISCSI_PORT -N "posix") if ! echo "$response" | grep -q "$message"; then exit 1 @@ -121,9 +101,9 @@ timing_exit sock_client timing_enter sock_server # start echo server using hello_sock echo server -$HELLO_SOCK_APP -H $TARGET_IP -P $ISCSI_PORT -S -N $TEST_TYPE & +$HELLO_SOCK_APP -H $TARGET_IP -P $ISCSI_PORT -S -N "posix" & server_pid=$! 
-trap 'killprocess $server_pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $server_pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $server_pid # send message to server using socat @@ -138,5 +118,5 @@ trap - SIGINT SIGTERM EXIT killprocess $server_pid -iscsitestfini $1 $2 +iscsitestfini timing_exit sock_server diff --git a/test/iscsi_tgt/trace_record/trace_record.sh b/test/iscsi_tgt/trace_record/trace_record.sh index baa7f39d46a..7e13838bac7 100755 --- a/test/iscsi_tgt/trace_record/trace_record.sh +++ b/test/iscsi_tgt/trace_record/trace_record.sh @@ -5,9 +5,7 @@ rootdir=$(readlink -f $testdir/../../..) source $rootdir/test/common/autotest_common.sh source $rootdir/test/iscsi_tgt/common.sh -# $1 = "iso" - triggers isolation mode (setting up required environment). -# $2 = test type posix or vpp. defaults to posix. -iscsitestinit $1 $2 +iscsitestinit TRACE_TMP_FOLDER=./tmp-trace TRACE_RECORD_OUTPUT=${TRACE_TMP_FOLDER}/record.trace @@ -42,7 +40,7 @@ echo "start iscsi_tgt with trace enabled" iscsi_pid=$! 
echo "Process pid: $iscsi_pid" -trap 'killprocess $iscsi_pid; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'killprocess $iscsi_pid; iscsitestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $iscsi_pid @@ -73,7 +71,7 @@ iscsiadm -m discovery -t sendtargets -p $TARGET_IP:$ISCSI_PORT iscsiadm -m node --login -p $TARGET_IP:$ISCSI_PORT waitforiscsidevices $((CONNECTION_NUMBER + 1)) -trap 'iscsicleanup; killprocess $iscsi_pid; killprocess $record_pid; delete_tmp_files; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'iscsicleanup; killprocess $iscsi_pid; killprocess $record_pid; delete_tmp_files; iscsitestfini; exit 1' SIGINT SIGTERM EXIT echo "Running FIO" $fio_py -p iscsi -i 131072 -d 32 -t randrw -r 1 @@ -88,7 +86,7 @@ for i in $(seq 0 $CONNECTION_NUMBER); do done echo -e $RPCS | $rpc_py -trap 'delete_tmp_files; iscsitestfini $1 $2; exit 1' SIGINT SIGTERM EXIT +trap 'delete_tmp_files; iscsitestfini; exit 1' SIGINT SIGTERM EXIT killprocess $iscsi_pid killprocess $record_pid @@ -132,4 +130,4 @@ for i in $(seq 0 $((len_arr_record_num - 1))); do done trap - SIGINT SIGTERM EXIT -iscsitestfini $1 $2 +iscsitestfini diff --git a/test/json_config/clear_config.py b/test/json_config/clear_config.py index 332e9466e39..bac1beebbce 100755 --- a/test/json_config/clear_config.py +++ b/test/json_config/clear_config.py @@ -157,6 +157,10 @@ def clear_vmd_subsystem(args, vmd_config): pass +def clear_sock_subsystem(args, sock_config): + pass + + def call_test_cmd(func): def rpc_test_cmd(*args, **kwargs): try: diff --git a/test/json_config/config_filter.py b/test/json_config/config_filter.py index 7a5cb4e80e5..cde2e24f96d 100755 --- a/test/json_config/config_filter.py +++ b/test/json_config/config_filter.py @@ -31,6 +31,7 @@ def filter_methods(do_remove_global_rpcs): 'bdev_set_options', 'bdev_nvme_set_options', 'bdev_nvme_set_hotplug', + 'sock_impl_set_options', ] data = json.loads(sys.stdin.read()) diff --git a/test/json_config/json_config.sh 
b/test/json_config/json_config.sh index 03d6bd5bd14..a5a714ccc25 100755 --- a/test/json_config/json_config.sh +++ b/test/json_config/json_config.sh @@ -326,7 +326,8 @@ function create_nvmf_subsystem_config() { function create_virtio_initiator_config() { timing_enter "${FUNCNAME[0]}" initiator_rpc bdev_virtio_attach_controller -t user -a /var/tmp/VhostScsiCtrlr0 -d scsi VirtioScsiCtrlr0 - initiator_rpc bdev_virtio_attach_controller -t user -a /var/tmp/VhostBlkCtrlr0 -d blk VirtioBlk0 + # FIXME: Specifying --vq-count is a workaround for issue #1583. + initiator_rpc bdev_virtio_attach_controller -t user -a /var/tmp/VhostBlkCtrlr0 -d blk --vq-count 2 VirtioBlk0 # TODO: initiator_rpc bdev_virtio_attach_controller -t user -a /var/tmp/VhostNvmeCtrlr0 -d nvme VirtioNvme0 timing_exit "${FUNCNAME[0]}" } diff --git a/test/make/check_so_deps.sh b/test/make/check_so_deps.sh index aea62237dd9..e91ded45726 100755 --- a/test/make/check_so_deps.sh +++ b/test/make/check_so_deps.sh @@ -17,7 +17,7 @@ source "$rootdir/test/common/autotest_common.sh" libdir="$rootdir/build/lib" libdeps_file="$rootdir/mk/spdk.lib_deps.mk" -source_abi_dir="$HOME/spdk_20_04/build/lib" +source_abi_dir="$HOME/spdk_abi_latest/build/lib" suppression_file="$HOME/abigail_suppressions.ini" function confirm_abi_deps() { @@ -34,250 +34,14 @@ function confirm_abi_deps() { fi cat << EOF > ${suppression_file} -[suppress_variable] - name = SPDK_LOG_IDXD -[suppress_variable] - name = SPDK_LOG_IOAT -[suppress_variable] - name = SPDK_LOG_JSON_UTIL -[suppress_variable] - name = SPDK_LOG_RPC -[suppress_variable] - name = SPDK_LOG_RPC_CLIENT -[suppress_function] - name = spdk_jsonrpc_server_handle_request -[suppress_function] - name = spdk_jsonrpc_server_handle_error -[suppress_function] - name = spdk_jsonrpc_server_send_response -[suppress_function] - name = spdk_jsonrpc_parse_request -[suppress_function] - name = spdk_jsonrpc_free_request -[suppress_function] - name = spdk_jsonrpc_parse_response -[suppress_variable] - name = 
SPDK_LOG_LOG_RPC -[suppress_variable] - name = SPDK_LOG_LOG -[suppress_variable] - name = SPDK_LOG_LVOL -[suppress_variable] - name = SPDK_LOG_NBD -[suppress_function] - name = spdk_nbd_disk_find_by_nbd_path -[suppress_function] - name = spdk_nbd_disk_first -[suppress_function] - name = spdk_nbd_disk_next -[suppress_function] - name = spdk_nbd_disk_get_nbd_path -[suppress_function] - name = spdk_nbd_disk_get_bdev_name -[suppress_variable] - name = SPDK_LOG_NET -[suppress_function] - name = spdk_interface_net_interface_add_ip_address -[suppress_function] - name = spdk_interface_net_interface_delete_ip_address -[suppress_function] - name = spdk_interface_get_list -[suppress_function] - name = spdk_get_uevent -[suppress_function] - name = spdk_uevent_connect -[suppress_function] - name = spdk_nvme_ctrlr_get_current_process -[suppress_function] - name = spdk_nvme_ctrlr_get_process -[suppress_function] - name = spdk_nvme_get_ctrlr_by_trid_unsafe -[suppress_function] - name = spdk_nvme_io_msg_process -[suppress_function] - name = spdk_nvme_wait_for_completion -[suppress_function] - name = spdk_nvme_wait_for_completion_robust_lock -[suppress_function] - name = spdk_nvme_wait_for_completion_timeout -[suppress_variable] - name = SPDK_LOG_NVME -[suppress_variable] - name = SPDK_LOG_OPAL -[suppress_variable] - name = spdk_opal_method -[suppress_variable] - name = spdk_opal_uid -[suppress_variable] - name = SPDK_LOG_REDUCE -[suppress_variable] - name = SPDK_LOG_THREAD -[suppress_variable] - name = SPDK_LOG_TRACE -[suppress_function] - name = spdk_crc32_table_init -[suppress_function] - name = spdk_crc32_update -[suppress_variable] - name = SPDK_LOG_VIRTIO_DEV -[suppress_variable] - name = SPDK_LOG_VIRTIO_PCI -[suppress_variable] - name = SPDK_LOG_VIRTIO_USER -[suppress_variable] - name = SPDK_LOG_VMD -[suppress_variable] - name = SPDK_LOG_ACCEL_IDXD -[suppress_variable] - name = SPDK_LOG_ACCEL_IOAT -[suppress_variable] - name = SPDK_LOG_AIO -[suppress_variable] - name = 
SPDK_LOG_VBDEV_COMPRESS -[suppress_variable] - name = SPDK_LOG_CRYPTO -[suppress_variable] - name = SPDK_LOG_VBDEV_DELAY -[suppress_function] - name = spdk_vbdev_error_create -[suppress_function] - name = spdk_vbdev_error_delete -[suppress_function] - name = spdk_vbdev_error_inject_error -[suppress_variable] - name = SPDK_LOG_BDEV_FTL -[suppress_variable] - name = SPDK_LOG_GPT_PARSE -[suppress_variable] - name = SPDK_LOG_VBDEV_GPT -[suppress_function] - name = spdk_gpt_parse_mbr -[suppress_function] - name = spdk_gpt_parse_partition_table -[suppress_variable] - name = SPDK_LOG_ISCSI_INIT -[suppress_variable] - name = SPDK_LOG_LVOL_RPC -[suppress_variable] - name = SPDK_LOG_VBDEV_LVOL -[suppress_variable] - name = SPDK_LOG_BDEV_MALLOC -[suppress_variable] - name = SPDK_LOG_BDEV_NULL -[suppress_variable] - name = SPDK_LOG_BDEV_NVME -[suppress_function] - name = spdk_bdev_nvme_create -[suppress_function] - name = spdk_bdev_nvme_delete -[suppress_function] - name = spdk_bdev_nvme_get_ctrlr -[suppress_function] - name = spdk_bdev_nvme_get_io_qpair -[suppress_function] - name = spdk_bdev_nvme_get_opts -[suppress_function] - name = spdk_bdev_nvme_set_hotplug -[suppress_function] - name = spdk_bdev_nvme_set_opts -[suppress_function] - name = spdk_vbdev_opal_create -[suppress_function] - name = spdk_vbdev_opal_destruct -[suppress_function] - name = spdk_vbdev_opal_enable_new_user -[suppress_function] - name = spdk_vbdev_opal_get_info_from_bdev -[suppress_function] - name = spdk_vbdev_opal_set_lock_state -[suppress_variable] - name = SPDK_LOG_BDEV_OCSSD -[suppress_variable] - name = SPDK_LOG_VBDEV_OPAL -[suppress_variable] - name = SPDK_LOG_OCFCTX -[suppress_variable] - name = SPDK_LOG_VBDEV_PASSTHRU -[suppress_variable] - name = SPDK_LOG_BDEV_PMEM -[suppress_function] - name = spdk_create_pmem_disk -[suppress_function] - name = spdk_delete_pmem_disk -[suppress_variable] - name = SPDK_LOG_BDEV_RAID -[suppress_variable] - name = SPDK_LOG_BDEV_RAID0 -[suppress_variable] - name 
= SPDK_LOG_BDEV_RAID5 -[suppress_variable] - name = SPDK_LOG_RAID_RPC -[suppress_variable] - name = SPDK_LOG_BDEV_RBD -[suppress_function] - name = spdk_bdev_rbd_create -[suppress_function] - name = spdk_bdev_rbd_delete -[suppress_function] - name = spdk_bdev_rbd_dup_config -[suppress_function] - name = spdk_bdev_rbd_free_config -[suppress_function] - name = spdk_bdev_rbd_resize -[suppress_variable] - name = SPDK_LOG_VBDEV_SPLIT -[suppress_function] - name = spdk_vbdev_split_destruct -[suppress_function] - name = spdk_vbdev_split_get_part_base -[suppress_variable] - name = SPDK_LOG_URING -[suppress_variable] - name = SPDK_LOG_VIRTIO -[suppress_variable] - name = SPDK_LOG_VIRTIO_BLK -[suppress_variable] - name = SPDK_LOG_VBDEV_ZONE_BLOCK -[suppress_function] - name = spdk_vbdev_zone_block_create -[suppress_function] - name = spdk_vbdev_zone_block_delete -[suppress_variable] - name = SPDK_LOG_BLOBFS_BDEV -[suppress_variable] - name = SPDK_LOG_BLOBFS_BDEV_RPC -[suppress_function] - name = spdk_blobfs_fuse_send_request -[suppress_function] - name = spdk_blobfs_fuse_start -[suppress_function] - name = spdk_blobfs_fuse_stop -[suppress_variable] - name = SPDK_LOG_APP_RPC -[suppress_function] - name = spdk_nvmf_parse_conf -[suppress_variable] - name = SPDK_LOG_VHOST -[suppress_variable] - name = SPDK_LOG_VHOST_BLK -[suppress_variable] - name = SPDK_LOG_VHOST_BLK_DATA -[suppress_variable] - name = SPDK_LOG_VHOST_RING -[suppress_variable] - name = SPDK_LOG_VHOST_RPC -[suppress_variable] - name = SPDK_LOG_VHOST_SCSI -[suppress_variable] - name = SPDK_LOG_VHOST_SCSI_DATA -[suppress_variable] - name = SPDK_LOG_VHOST_SCSI_QUEUE -[suppress_variable] - name = spdk_vhost_scsi_device_backend [suppress_type] - name = spdk_net_impl + name = spdk_nvme_ctrlr_data [suppress_type] - name = spdk_lvol + name = spdk_nvme_ns_data +[suppress_type] + name = spdk_nvme_log_page +[suppress_type] + name = spdk_nvme_ctrlr_opts EOF for object in "$libdir"/libspdk_*.so; do @@ -286,10 +50,16 @@ EOF 
echo "No corresponding object for $so_file in canonical directory. Skipping." continue fi + if [ "$so_file" == "libspdk_blobfs_bdev.so" ]; then + # FIXME: Disable checking for blobfs_bdev.so. Allows updating ABI reference repo + # without affecting outstanding patches and requiring immediate rebase. + echo "Checking objects for $so_file temporarily disabled. Skipping." + continue + fi - if ! output=$(abidiff "$source_abi_dir/$so_file" "$libdir/$so_file" --leaf-changes-only --suppressions $suppression_file --stat); then + if ! output=$(abidiff "$source_abi_dir/$so_file" "$libdir/$so_file" --headers-dir1 "$source_abi_dir/../../include/" --headers-dir2 "$rootdir/include" --leaf-changes-only --suppressions $suppression_file --stat); then # remove any filtered out variables. - output=${output// [()][^)]*[)]/} + output=$(sed "s/ [()][^)]*[)]//g" <<< "$output") IFS="." read -r _ _ new_so_maj new_so_min < <(readlink "$libdir/$so_file") IFS="." read -r _ _ old_so_maj old_so_min < <(readlink "$source_abi_dir/$so_file") @@ -481,13 +251,18 @@ echo "---------------------------------------------------------------------" # users can define their own environment abstraction. However we do want to still check it # for dependencies to avoid printing out a bunch of confusing symbols under the missing # symbols section. -SPDK_LIBS=$(ls -1 $libdir/libspdk_*.so | grep -v libspdk_env_dpdk.so) +# FIXME: Disable checking for blobfs_bdev.so. Allows updating ABI reference repo +# without affecting outstanding patches and requiring immediate rebase. 
+SPDK_LIBS=$(ls -1 $libdir/libspdk_*.so | grep -v libspdk_env_dpdk.so | grep -v blobfs_bdev.so) DEP_LIBS=$(ls -1 $libdir/libspdk_*.so) IGNORED_LIBS=() if grep -q 'CONFIG_VHOST_INTERNAL_LIB?=n' $rootdir/mk/config.mk; then IGNORED_LIBS+=("rte_vhost") fi +if grep -q 'CONFIG_RDMA?=n' $rootdir/mk/config.mk; then + IGNORED_LIBS+=("rdma") +fi ( for lib in $SPDK_LIBS; do confirm_deps $lib & done diff --git a/test/nvme/cuse/nvme_ns_manage_cuse.sh b/test/nvme/cuse/nvme_ns_manage_cuse.sh index dae0baee097..bb708cc47be 100755 --- a/test/nvme/cuse/nvme_ns_manage_cuse.sh +++ b/test/nvme/cuse/nvme_ns_manage_cuse.sh @@ -15,7 +15,6 @@ sleep 1 bdfs=$(get_nvme_bdfs) $rootdir/scripts/setup.sh reset -sleep 1 # Find bdf that supports Namespace Managment for bdf in $bdfs; do @@ -56,10 +55,12 @@ function clean_up() { $rootdir/scripts/setup.sh reset # This assumes every NVMe controller contains single namespace, - # encompassing Total NVM Capacity and formatted as 4k block size. + # encompassing Total NVM Capacity and formatted as 512 block size. + # 512 block size is needed for test/vhost/vhost_boot.sh to + # successfully run. 
tnvmcap=$($NVME_CMD id-ctrl ${nvme_dev} | grep tnvmcap | cut -d: -f2) - blksize=4096 + blksize=512 size=$((tnvmcap / blksize)) diff --git a/test/nvme/cuse/spdk_nvme_cli_cuse.sh b/test/nvme/cuse/spdk_nvme_cli_cuse.sh index ce1affbca25..da3698c8a54 100755 --- a/test/nvme/cuse/spdk_nvme_cli_cuse.sh +++ b/test/nvme/cuse/spdk_nvme_cli_cuse.sh @@ -17,18 +17,17 @@ rpc_py=$rootdir/scripts/rpc.py bdf=$(get_first_nvme_bdf) PCI_WHITELIST="${bdf}" $rootdir/scripts/setup.sh reset -sleep 1 nvme_name=$(get_nvme_ctrlr_from_bdf ${bdf}) if [[ -z "$nvme_name" ]]; then echo "setup.sh failed bind kernel driver to ${bdf}" return 1 fi -set +e - ctrlr="/dev/${nvme_name}" ns="/dev/${nvme_name}n1" +waitforblk "${nvme_name}n1" + oacs=$(${NVME_CMD} id-ctrl $ctrlr | grep oacs | cut -d: -f2) oacs_firmware=$((oacs & 0x4)) @@ -47,8 +46,6 @@ ${NVME_CMD} get-feature $ctrlr -f 1 -s 1 -l 100 > ${KERNEL_OUT}.8 ${NVME_CMD} get-log $ctrlr -i 1 -l 100 > ${KERNEL_OUT}.9 ${NVME_CMD} reset $ctrlr > ${KERNEL_OUT}.10 -set -e - $rootdir/scripts/setup.sh $SPDK_BIN_DIR/spdk_tgt -m 0x3 & @@ -60,23 +57,17 @@ waitforlisten $spdk_tgt_pid $rpc_py bdev_nvme_attach_controller -b Nvme0 -t PCIe -a ${bdf} $rpc_py bdev_nvme_cuse_register -n Nvme0 -sleep 5 - -if [ ! 
-c /dev/spdk/nvme0 ]; then - return 1 -fi +ctrlr="/dev/spdk/nvme0" +ns="${ctrlr}n1" +waitforfile "$ns" $rpc_py bdev_get_bdevs $rpc_py bdev_nvme_get_controllers -set +e - -ns="/dev/spdk/nvme0n1" ${NVME_CMD} get-ns-id $ns > ${CUSE_OUT}.1 ${NVME_CMD} id-ns $ns > ${CUSE_OUT}.2 ${NVME_CMD} list-ns $ns > ${CUSE_OUT}.3 -ctrlr="/dev/spdk/nvme0" ${NVME_CMD} id-ctrl $ctrlr > ${CUSE_OUT}.4 ${NVME_CMD} list-ctrl $ctrlr > ${CUSE_OUT}.5 if [ "$oacs_firmware" -ne "0" ]; then @@ -88,8 +79,6 @@ ${NVME_CMD} get-feature $ctrlr -f 1 -s 1 -l 100 > ${CUSE_OUT}.8 ${NVME_CMD} get-log $ctrlr -i 1 -l 100 > ${CUSE_OUT}.9 ${NVME_CMD} reset $ctrlr > ${CUSE_OUT}.10 -set -e - for i in {1..10}; do if [ -f "${KERNEL_OUT}.${i}" ] && [ -f "${CUSE_OUT}.${i}" ]; then sed -i "s/${nvme_name}/nvme0/g" ${KERNEL_OUT}.${i} @@ -99,9 +88,20 @@ done rm -Rf $testdir/match_files -if [ ! -c "$ctrlr" ]; then - return 1 -fi +# Verify read/write path +tr < /dev/urandom -dc "a-zA-Z0-9" | fold -w 512 | head -n 1 > $testdir/write_file +${NVME_CMD} write $ns --data-size=512 --data=$testdir/write_file +${NVME_CMD} read $ns --data-size=512 --data=$testdir/read_file +diff --ignore-trailing-space $testdir/write_file $testdir/read_file +rm -f $testdir/write_file $testdir/read_file + +# Verify admin cmd when no data is transferred, +# by creating and deleting completion queue. 
+${NVME_CMD} admin-passthru $ctrlr -o 5 --cdw10=0x3ff0003 --cdw11=0x1 -r +${NVME_CMD} admin-passthru $ctrlr -o 4 --cdw10=0x3 + +[[ -c "$ctrlr" ]] +[[ -c "$ns" ]] trap - SIGINT SIGTERM EXIT killprocess $spdk_tgt_pid diff --git a/test/nvme/cuse/spdk_smartctl_cuse.sh b/test/nvme/cuse/spdk_smartctl_cuse.sh index a92ca119936..69532ae58f4 100755 --- a/test/nvme/cuse/spdk_smartctl_cuse.sh +++ b/test/nvme/cuse/spdk_smartctl_cuse.sh @@ -11,7 +11,6 @@ rpc_py=$rootdir/scripts/rpc.py bdf=$(get_first_nvme_bdf) PCI_WHITELIST="${bdf}" $rootdir/scripts/setup.sh reset -sleep 1 nvme_name=$(get_nvme_ctrlr_from_bdf ${bdf}) if [[ -z "$nvme_name" ]]; then echo "setup.sh failed bind kernel driver to ${bdf}" diff --git a/test/nvme/hotplug.sh b/test/nvme/hotplug.sh index c4494897a59..13011e19355 100755 --- a/test/nvme/hotplug.sh +++ b/test/nvme/hotplug.sh @@ -97,6 +97,8 @@ timing_enter copy_repo files_to_copy="scripts " files_to_copy+="include/spdk/pci_ids.h " files_to_copy+="build/examples/hotplug " +files_to_copy+="build/lib " +files_to_copy+="dpdk/build/lib " ( cd "$rootdir" tar -cf - $files_to_copy @@ -107,14 +109,14 @@ insert_devices timing_enter hotplug_test -ssh_vm "build/examples/hotplug -i 0 -t 25 -n 4 -r 8" & +ssh_vm "LD_LIBRARY_PATH=/root//build/lib:/root/dpdk/build/lib:$LD_LIBRARY_PATH build/examples/hotplug -i 0 -t 25 -n 4 -r 8" & example_pid=$! -sleep 4 +sleep 6 remove_devices sleep 4 insert_devices -sleep 4 +sleep 6 remove_devices devices_delete diff --git a/test/nvme/hw_hotplug.sh b/test/nvme/hw_hotplug.sh index bea5c759983..ba9c5946357 100755 --- a/test/nvme/hw_hotplug.sh +++ b/test/nvme/hw_hotplug.sh @@ -4,6 +4,10 @@ testdir=$(readlink -f $(dirname $0)) rootdir=$(readlink -f $testdir/../..) 
source $rootdir/test/common/autotest_common.sh +export SPDK_LIB_DIR="$rootdir/build/lib" +export DPDK_LIB_DIR="$rootdir/dpdk/build/lib" +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$SPDK_LIB_DIR:$DPDK_LIB_DIR + function insert_device() { ssh root@$ip 'Beetle --SetGpio "$gpio" HIGH' waitforblk $name diff --git a/test/nvme/nvme.sh b/test/nvme/nvme.sh index 74ba496cb76..9269312860d 100755 --- a/test/nvme/nvme.sh +++ b/test/nvme/nvme.sh @@ -50,8 +50,6 @@ if [ $(uname) = Linux ]; then # check that our setup.sh script does not bind NVMe devices to uio/vfio if they # have an active mountpoint $rootdir/scripts/setup.sh reset - # give kernel nvme driver some time to create the block devices before we start looking for them - sleep 1 blkname='' # first, find an NVMe device that does not have an active mountpoint already; # this covers rare case where someone is running this test script on a system diff --git a/test/nvme/nvme_opal.sh b/test/nvme/nvme_opal.sh index 66407ced7a9..3fba5c11a28 100755 --- a/test/nvme/nvme_opal.sh +++ b/test/nvme/nvme_opal.sh @@ -8,16 +8,34 @@ rpc_py="$rootdir/scripts/rpc.py" source "$rootdir/scripts/common.sh" source "$rootdir/test/common/autotest_common.sh" -function opal_init() { - bdf1=$($rootdir/scripts/gen_nvme.sh --json | jq -r '.config[].params | select(.name=="Nvme0").traddr') - $rpc_py bdev_nvme_attach_controller -b "nvme0" -t "pcie" -a $bdf1 +# The OPAL CI test is only used for P4510 devices. +mapfile -t bdfs < <(get_nvme_bdfs_by_id 0x0a54) +if [[ -z ${bdfs[0]} ]]; then + echo "No P4510 device found, exit the tests" + exit 1 +fi - # Ignore bdev_nvme_opal_init failure because sometimes revert TPer might fail and - # in another run we don't want init to return errors to stop other tests. - $rpc_py bdev_nvme_opal_init -b nvme0 -p test || true +bdf=${bdfs[0]} + +function opal_revert_and_init() { + $SPDK_BIN_DIR/spdk_tgt & + spdk_tgt_pid=$! 
+ trap 'killprocess $spdk_tgt_pid; exit 1' SIGINT SIGTERM EXIT + waitforlisten $spdk_tgt_pid + + $rootdir/scripts/rpc.py bdev_nvme_attach_controller -b "nvme0" -t "pcie" -a ${bdf} + # Ignore if this fails. + $rootdir/scripts/rpc.py bdev_nvme_opal_revert -b nvme0 -p test || true + sleep 1 + $rpc_py bdev_nvme_opal_init -b nvme0 -p test + $rpc_py bdev_nvme_detach_controller nvme0 + + killprocess $spdk_tgt_pid } function test_opal_cmds() { + $rpc_py bdev_nvme_attach_controller -b "nvme0" -t "pcie" -a ${bdf} + $rpc_py bdev_opal_create -b nvme0 -n 1 -i 1 -s 0 -l 1024 -p test $rpc_py bdev_opal_create -b nvme0 -n 1 -i 2 -s 1024 -l 512 -p test $rpc_py bdev_opal_get_info -b nvme0n1r1 -p test @@ -49,7 +67,8 @@ function test_opal_cmds() { } function setup_test_environment() { - $rpc_py bdev_nvme_attach_controller -b "nvme0" -t "pcie" -a $bdf1 + $rpc_py bdev_nvme_attach_controller -b "nvme0" -t "pcie" -a ${bdf} + $rpc_py bdev_opal_create -b nvme0 -n 1 -i 1 -s 0 -l 1024 -p test $rpc_py bdev_opal_create -b nvme0 -n 1 -i 2 -s 1024 -l 512 -p test $rpc_py bdev_opal_create -b nvme0 -n 1 -i 3 -s 4096 -l 4096 -p test @@ -69,16 +88,14 @@ function clean_up() { } function revert() { - # Ignore revert failure and kill the process - $rpc_py bdev_nvme_opal_revert -b nvme0 -p test || true + $rpc_py bdev_nvme_opal_revert -b nvme0 -p test } function opal_spdk_tgt() { $SPDK_BIN_DIR/spdk_tgt & spdk_tgt_pid=$! - trap 'revert; killprocess $spdk_tgt_pid; exit 1' SIGINT SIGTERM EXIT + trap 'killprocess $spdk_tgt_pid; exit 1' SIGINT SIGTERM EXIT waitforlisten $spdk_tgt_pid - opal_init test_opal_cmds killprocess $spdk_tgt_pid } @@ -86,11 +103,12 @@ function opal_spdk_tgt() { function opal_bdevio() { $rootdir/test/bdev/bdevio/bdevio -w & bdevio_pid=$! 
- trap 'revert; killprocess $bdevio_pid; exit 1' SIGINT SIGTERM EXIT + trap 'killprocess $bdevio_pid; exit 1' SIGINT SIGTERM EXIT waitforlisten $bdevio_pid setup_test_environment $rootdir/test/bdev/bdevio/tests.py perform_tests clean_up + $rpc_py bdev_nvme_detach_controller nvme0 trap - SIGINT SIGTERM EXIT killprocess $bdevio_pid } @@ -104,10 +122,13 @@ function opal_bdevperf() { $rootdir/test/bdev/bdevperf/bdevperf.py perform_tests clean_up revert + $rpc_py bdev_nvme_detach_controller nvme0 trap - SIGINT SIGTERM EXIT killprocess $bdevperf_pid } +opal_revert_and_init + run_test "nvme_opal_spdk_tgt" opal_spdk_tgt run_test "nvme_opal_bdevio" opal_bdevio run_test "nvme_opal_bdevperf" opal_bdevperf diff --git a/test/nvme/perf/common.sh b/test/nvme/perf/common.sh index 413814c2730..721cf01f080 100755 --- a/test/nvme/perf/common.sh +++ b/test/nvme/perf/common.sh @@ -1,40 +1,5 @@ #!/usr/bin/env bash -set -e -BASE_DIR=$(readlink -f $(dirname $0)) -ROOT_DIR=$(readlink -f $BASE_DIR/../../..) -rootdir=$ROOT_DIR -PLUGIN_DIR=$ROOT_DIR/build/fio -BDEVPERF_DIR=$ROOT_DIR/test/bdev/bdevperf -NVMEPERF_DIR=$ROOT_DIR/build/examples/perf -. $ROOT_DIR/scripts/common.sh || exit 1 -. 
$ROOT_DIR/test/common/autotest_common.sh -NVME_FIO_RESULTS=$BASE_DIR/result.json - -declare -A KERNEL_ENGINES -KERNEL_ENGINES=( - ["kernel-libaio"]="--ioengine=libaio" - ["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100" - ["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100" - ["kernel-io-uring"]="--ioengine=io_uring") - -RW=randrw -MIX=100 -IODEPTH=256 -BLK_SIZE=4096 -RUNTIME=600 -RAMP_TIME=30 -NUMJOBS=1 -REPEAT_NO=3 -FIO_BIN=$CONFIG_FIO_SOURCE_DIR/fio -PLUGIN="nvme" -DISKNO=1 -CPUS_ALLOWED=1 -NOIOSCALING=false -PRECONDITIONING=true -ONEWORKLOAD=false -DATE="$(date +'%m_%d_%Y_%H%M%S')" - function discover_bdevs() { local rootdir=$1 local config_file=$2 @@ -71,6 +36,68 @@ function discover_bdevs() { rm -f /var/run/spdk_bdev0 } +function create_spdk_bdev_conf() { + local output + local disk_cfg + local bdev_io_cache_size=$1 + local bdev_io_pool_size=$2 + local bdev_json_cfg=() + local bdev_opts=() + + disk_cfg=($(grep -vP "^\s*#" "$DISKCFG")) + + if [[ -n "$bdev_io_cache_size" ]]; then + bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size") + fi + + if [[ -n "$bdev_io_pool_size" ]]; then + bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size") + fi + + local IFS="," + if [[ ${#bdev_opts[@]} -gt 0 ]]; then + bdev_json_cfg+=("$( + cat <<- JSON + { + "method": "bdev_set_options", + "params": { + ${bdev_opts[*]} + } + } + JSON + )") + fi + + for i in "${!disk_cfg[@]}"; do + bdev_json_cfg+=("$( + cat <<- JSON + { + "method": "bdev_nvme_attach_controller", + "params": { + "trtype": "PCIe", + "name":"Nvme${i}", + "traddr":"${disk_cfg[i]}" + } + } + JSON + )") + done + + local IFS="," + jq -r '.' 
<<- JSON > $testdir/bdev.conf + { + "subsystems": [ + { + "subsystem": "bdev", + "config": [ + ${bdev_json_cfg[*]} + ] + } + ] + } + JSON +} + function is_bdf_not_mounted() { local bdf=$1 local blkname @@ -108,35 +135,43 @@ function get_numa_node() { done elif [[ "$plugin" =~ "bdev" ]]; then local bdevs - bdevs=$(discover_bdevs $ROOT_DIR $BASE_DIR/bdev.conf --json) + bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf --json) for name in $disks; do local bdev_bdf bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs) cat /sys/bus/pci/devices/$bdev_bdf/numa_node done else - # Only target not mounted NVMes - for bdf in $(get_nvme_bdfs); do - if is_bdf_not_mounted $bdf; then - cat /sys/bus/pci/devices/$bdf/numa_node - fi + for name in $disks; do + local bdf + # Not reading directly from /sys/block/nvme* because of a kernel bug + # which results in NUMA 0 always getting reported. + bdf=$(cat /sys/block/$name/device/address) + cat /sys/bus/pci/devices/$bdf/numa_node done fi } function get_disks() { local plugin=$1 + local disk_cfg + + disk_cfg=($(grep -vP "^\s*#" "$DISKCFG")) if [[ "$plugin" =~ "nvme" ]]; then - for bdf in $(get_nvme_bdfs); do - echo "$bdf" - done + # PCI BDF address is enough for nvme-perf and nvme-fio-plugin, + # so just print them from configuration file + echo "${disk_cfg[*]}" elif [[ "$plugin" =~ "bdev" ]]; then + # Generate NvmeXn1 bdev name configuration file for bdev-perf + # and bdev-fio-plugin local bdevs - bdevs=$(discover_bdevs $ROOT_DIR $BASE_DIR/bdev.conf --json) - jq -r '.[].name' <<< $bdevs + local disk_no + disk_no=${#disk_cfg[@]} + eval echo "Nvme{0..$((disk_no - 1))}n1" else - # Only target not mounted NVMes - for bdf in $(get_nvme_bdfs); do + # Find nvme block devices and only use the ones which + # are not mounted + for bdf in "${disk_cfg[@]}"; do if is_bdf_not_mounted $bdf; then local blkname blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}') @@ -168,74 +203,104 @@ function 
create_fio_config() { local disks_numa=($4) local cores=($5) local total_disks=${#disks[@]} - local no_cores=${#cores[@]} - local filename="" - + local fio_job_section=() + local num_cores=${#cores[@]} + local disks_per_core=$((disk_no / num_cores)) + local disks_per_core_mod=$((disk_no % num_cores)) local cores_numa - cores_numa=($(get_cores_numa_node "$5")) - local disks_per_core=$((disk_no / no_cores)) - local disks_per_core_mod=$((disk_no % no_cores)) - - # For kernel dirver, each disk will be alligned with all cpus on the same NUMA node - if [[ "$plugin" =~ "kernel" ]]; then - for ((i = 0; i < disk_no; i++)); do - sed -i -e "\$a[filename${i}]" $BASE_DIR/config.fio - filename="/dev/${disks[$i]}" - sed -i -e "\$afilename=$filename" $BASE_DIR/config.fio - cpu_used="" - for ((j = 0; j < no_cores; j++)); do - core_numa=${cores_numa[$j]} - if [ "${disks_numa[$i]}" = "$core_numa" ]; then - cpu_used+="${cores[$j]}," - fi - done - sed -i -e "\$acpus_allowed=$cpu_used" $BASE_DIR/config.fio - echo "" >> $BASE_DIR/config.fio - done - else - for ((i = 0; i < no_cores; i++)); do - core_numa=${cores_numa[$i]} - total_disks_per_core=$disks_per_core - if [ "$disks_per_core_mod" -gt "0" ]; then - total_disks_per_core=$((disks_per_core + 1)) - disks_per_core_mod=$((disks_per_core_mod - 1)) - fi + cores_numa=($(get_cores_numa_node "${cores[*]}")) + + # Following part of this function still leverages global variables a lot. + # It's a mix of local variables passed as arguments to function with global variables. This is messy. + # TODO: Modify this to be consistent with how variables are used here. Aim for using only + # local variables to get rid of globals as much as possible. 
+ desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\"" + cp "$testdir/config.fio.tmp" "$testdir/config.fio" + cat <<- EOF >> $testdir/config.fio + description=$desc + + rw=$RW + rwmixread=$MIX + bs=$BLK_SIZE + runtime=$RUNTIME + ramp_time=$RAMP_TIME + numjobs=$NUMJOBS + log_avg_msec=$SAMPLING_INT + EOF + + if $GTOD_REDUCE; then + echo "gtod_reduce=1" >> $testdir/config.fio + fi - if [ "$total_disks_per_core" = "0" ]; then - break - fi + if [[ "$IO_BATCH_SUBMIT" -gt 0 ]]; then + echo "iodepth_batch_submit=$IO_BATCH_SUBMIT" >> $testdir/config.fio + fi - sed -i -e "\$a[filename${i}]" $BASE_DIR/config.fio - #use cpus_allowed as cpumask works only for cores 1-32 - sed -i -e "\$acpus_allowed=${cores[$i]}" $BASE_DIR/config.fio - m=0 #counter of disks per cpu core numa - n=0 #counter of all disks - while [ "$m" -lt "$total_disks_per_core" ]; do - if [ ${disks_numa[$n]} = $core_numa ]; then - m=$((m + 1)) - if [[ "$plugin" = "spdk-plugin-nvme" ]]; then - filename='trtype=PCIe traddr='${disks[$n]//:/.}' ns=1' - elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then - filename=${disks[$n]} - fi - sed -i -e "\$afilename=$filename" $BASE_DIR/config.fio - #Mark numa of n'th disk as "x" to mark it as claimed - disks_numa[$n]="x" + if [[ "$IO_BATCH_COMPLETE" -gt 0 ]]; then + echo "iodepth_batch_complete=$IO_BATCH_COMPLETE" >> $testdir/config.fio + fi + + for i in "${!cores[@]}"; do + local m=0 #Counter of disks per NUMA node + local n=0 #Counter of all disks in test + core_numa=${cores_numa[$i]} + + total_disks_per_core=$disks_per_core + # Check how many "stray" disks are unassigned to CPU cores + # Assign one disk to current CPU core and subtract it from the total of + # unassigned disks + if [[ "$disks_per_core_mod" -gt "0" ]]; then + total_disks_per_core=$((disks_per_core + 1)) + disks_per_core_mod=$((disks_per_core_mod - 1)) + fi + # SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread. 
+ # Therefore, the per thread queue depth is set to the desired IODEPTH/device X the number of devices per thread. + QD=$IODEPTH + if [[ "$NOIOSCALING" = false ]]; then + QD=$((IODEPTH * total_disks_per_core)) + fi + + fio_job_section+=("") + fio_job_section+=("[filename${i}]") + fio_job_section+=("iodepth=$QD") + fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}") + + while [[ "$m" -lt "$total_disks_per_core" ]]; do + # Try to add disks to job section if it's NUMA node matches NUMA + # for currently selected CPU + if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then + if [[ "$plugin" == "spdk-plugin-nvme" ]]; then + fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}") + elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then + fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}") + elif [[ "$plugin" =~ "kernel" ]]; then + fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}") fi - n=$((n + 1)) - # If there is no more disks with numa node same as cpu numa node, switch to other numa node. - if [ $n -ge $total_disks ]; then - if [ "$core_numa" = "1" ]; then - core_numa=0 - else - core_numa=1 - fi - n=0 + m=$((m + 1)) + + #Mark numa of n'th disk as "x" to mark it as claimed for next loop iterations + disks_numa[$n]="x" + fi + n=$((n + 1)) + + # If there is no more disks with numa node same as cpu numa node, switch to + # other numa node, go back to start of loop and try again. + if [[ $n -ge $total_disks ]]; then + echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}" + echo "NVMe assignment for this CPU will be cross-NUMA." 
+ if [[ "$core_numa" == "1" ]]; then + core_numa=0 + else + core_numa=1 fi - done - echo "" >> $BASE_DIR/config.fio + n=0 + fi done - fi + done + + printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio + echo "INFO: Generated fio configuration file:" + cat $testdir/config.fio } function preconditioning() { @@ -243,9 +308,9 @@ function preconditioning() { local filename="" local nvme_list - HUGEMEM=8192 $ROOT_DIR/scripts/setup.sh - cp $BASE_DIR/config.fio.tmp $BASE_DIR/config.fio - echo "[Preconditioning]" >> $BASE_DIR/config.fio + HUGEMEM=8192 $rootdir/scripts/setup.sh + cp $testdir/config.fio.tmp $testdir/config.fio + echo "[Preconditioning]" >> $testdir/config.fio # Generate filename argument for FIO. # We only want to target NVMes not bound to nvme driver. @@ -259,70 +324,41 @@ function preconditioning() { echo "** Preconditioning disks, this can take a while, depending on the size of disks." run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \ --rw=write --iodepth=32 --output-format=normal - rm -f $BASE_DIR/config.fio + rm -f $testdir/config.fio +} + +function bc() { + $(type -P bc) -l <<< "scale=3; $1" } function get_results() { - local reads_pct=$2 - local writes_pct=$((100 - $2)) - - case "$1" in - iops) - iops=$(jq -r '.jobs[] | (.read.iops + .write.iops)' $NVME_FIO_RESULTS) - iops=${iops%.*} - echo $iops - ;; - mean_lat_usec) - mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)" $NVME_FIO_RESULTS) - mean_lat=${mean_lat%.*} - echo $((mean_lat / 100000)) - ;; - p99_lat_usec) - p99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.000000\" * $reads_pct + .write.clat_ns.percentile.\"99.000000\" * $writes_pct)" $NVME_FIO_RESULTS) - p99_lat=${p99_lat%.*} - echo $((p99_lat / 100000)) - ;; - p99_99_lat_usec) - p99_99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.990000\" * $reads_pct + .write.clat_ns.percentile.\"99.990000\" * $writes_pct)" $NVME_FIO_RESULTS) - 
p99_99_lat=${p99_99_lat%.*} - echo $((p99_99_lat / 100000)) - ;; - stdev_usec) - stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)" $NVME_FIO_RESULTS) - stdev=${stdev%.*} - echo $((stdev / 100000)) - ;; - mean_slat_usec) - mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)" $NVME_FIO_RESULTS) - mean_slat=${mean_slat%.*} - echo $((mean_slat / 100000)) - ;; - mean_clat_usec) - mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)" $NVME_FIO_RESULTS) - mean_clat=${mean_clat%.*} - echo $((mean_clat / 100000)) - ;; - bw_Kibs) - bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $NVME_FIO_RESULTS) - bw=${bw%.*} - echo $((bw)) - ;; - esac + local iops bw stdev + local p90_lat p99_lat p99_99_lat + local mean_slat mean_clat + local reads_pct + local writes_pct + + reads_pct=$(bc "$1 / 100") + writes_pct=$(bc "1 - $reads_pct") + + iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE) + bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE) + mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE) + p90_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"90.000000\" // 0 * $reads_pct + .write.clat_ns.percentile.\"90.000000\" // 0 * $writes_pct)/1000" $TMP_RESULT_FILE) + p99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.000000\" // 0 * $reads_pct + .write.clat_ns.percentile.\"99.000000\" // 0 * $writes_pct)/1000" $TMP_RESULT_FILE) + p99_99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.990000\" // 0 * $reads_pct + .write.clat_ns.percentile.\"99.990000\" // 0 * $writes_pct)/1000" $TMP_RESULT_FILE) + stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)/1000" $TMP_RESULT_FILE) + mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE) + 
mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE) + + echo "$iops $bw $mean_lat $p90_lat $p99_lat $p99_99_lat $stdev $mean_slat $mean_clat" } function get_bdevperf_results() { - case "$1" in - iops) - iops=$(grep Total $NVME_FIO_RESULTS | awk -F 'Total' '{print $2}' | awk '{print $2}') - iops=${iops%.*} - echo $iops - ;; - bw_Kibs) - bw_MBs=$(grep Total $NVME_FIO_RESULTS | awk -F 'Total' '{print $2}' | awk '{print $4}') - bw_MBs=${bw_MBs%.*} - echo $((bw_MBs * 1024)) - ;; - esac + local iops + local bw_MBs + read -r iops bw_MBs <<< $(grep Total $TMP_RESULT_FILE | tr -s " " | awk -F ":| " '{print $5" "$7}') + echo "$iops $(bc "$bw_MBs * 1024")" } function get_nvmeperf_results() { @@ -332,26 +368,17 @@ function get_nvmeperf_results() { local max_lat_usec local min_lat_usec - read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $NVME_FIO_RESULTS | grep -oP "(?<=Total : )(.*+)") - - # We need to get rid of the decimal spaces due - # to use of arithmetic expressions instead of "bc" for calculations - iops=${iops%.*} - bw_MBs=${bw_MBs%.*} - mean_lat_usec=${mean_lat_usec%.*} - min_lat_usec=${min_lat_usec%.*} - max_lat_usec=${max_lat_usec%.*} - - echo "$iops $(bc <<< "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec" + read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)") + echo "$iops $(bc "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec" } function run_spdk_nvme_fio() { local plugin=$1 echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting." 
if [[ "$plugin" = "spdk-plugin-nvme" ]]; then - LD_PRELOAD=$PLUGIN_DIR/spdk_nvme $FIO_BIN $BASE_DIR/config.fio --output-format=json "${@:2}" --ioengine=spdk + LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then - LD_PRELOAD=$PLUGIN_DIR/spdk_bdev $FIO_BIN $BASE_DIR/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$BASE_DIR/bdev.conf --spdk_mem=4096 + LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096 fi sleep 1 @@ -359,13 +386,13 @@ function run_spdk_nvme_fio() { function run_nvme_fio() { echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting." - $FIO_BIN $BASE_DIR/config.fio --output-format=json "$@" + $FIO_BIN $testdir/config.fio --output-format=json "$@" sleep 1 } function run_bdevperf() { echo "** Running bdevperf test, this can take a while, depending on the run-time setting." - $BDEVPERF_DIR/bdevperf --json $BASE_DIR/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" + $bdevperf_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r /var/tmp/spdk.sock sleep 1 } @@ -382,7 +409,7 @@ function run_nvmeperf() { echo "** Running nvme perf test, this can take a while, depending on the run-time setting." 
# Run command in separate shell as this solves quoting issues related to r_opt var - $SHELL -c "$NVMEPERF_DIR/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]" + $SHELL -c "$nvmeperf_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]" sleep 1 } @@ -402,14 +429,6 @@ function wait_for_nvme_reload() { function verify_disk_number() { # Check if we have appropriate number of disks to carry out the test - if [[ "$PLUGIN" =~ "bdev" ]]; then - cat <<- JSON > "$BASE_DIR/bdev.conf" - {"subsystems":[ - $("$ROOT_DIR/scripts/gen_nvme.sh" --json) - ]} - JSON - fi - disks=($(get_disks $PLUGIN)) if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then DISKNO=${#disks[@]} @@ -418,91 +437,3 @@ function verify_disk_number() { false fi } - -function usage() { - set +x - [[ -n $2 ]] && ( - echo "$2" - echo "" - ) - echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration" - echo "Usage: $(basename $1) [options]" - echo "-h, --help Print help and exit" - echo - echo "Workload parameters:" - echo " --rw=STR Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]" - echo " --rwmixread=INT Percentage of a mixed workload that should be reads. [default=$MIX]" - echo " --iodepth=INT Number of I/Os to keep in flight against the file. [default=$IODEPTH]" - echo " --block-size=INT The block size in bytes used for I/O units. [default=$BLK_SIZE]" - echo " --run-time=TIME[s] Tell fio to run the workload for the specified period of time. [default=$RUNTIME]" - echo " --ramp-time=TIME[s] Fio will run the specified workload for this amount of time before" - echo " logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests." - echo " --numjobs=INT Create the specified number of clones of this job. [default=$NUMJOBS]" - echo " Applicable only for fio-based tests." - echo " --repeat-no=INT How many times to repeat workload test. 
[default=$REPEAT_NO]" - echo " Test result will be an average of repeated test runs." - echo " --fio-bin=PATH Path to fio binary. [default=$FIO_BIN]" - echo " Applicable only for fio-based tests." - echo - echo "Test setup parameters:" - echo " --driver=STR Selects tool used for testing. Choices available:" - echo " - spdk-perf-nvme (SPDK nvme perf)" - echo " - spdk-perf-bdev (SPDK bdev perf)" - echo " - spdk-plugin-nvme (SPDK nvme fio plugin)" - echo " - spdk-plugin-bdev (SPDK bdev fio plugin)" - echo " - kernel-classic-polling" - echo " - kernel-hybrid-polling" - echo " - kernel-libaio" - echo " - kernel-io-uring" - echo " --disk-no=INT,ALL Number of disks to test on, this will run one workload on selected number od disks," - echo " it discards max-disk setting, if =ALL then test on all found disk. [default=$DISKNO]" - echo " --max-disk=INT,ALL Number of disks to test on, this will run multiple workloads with increasing number of disk each run." - echo " If =ALL then test on all found disk. [default=$DISKNO]" - echo " --cpu-allowed=INT Comma-separated list of CPU cores used to run the workload. [default=$CPUS_ALLOWED]" - echo " --no-preconditioning Skip preconditioning" - echo " --no-io-scaling Do not scale iodepth for each device in SPDK fio plugin. 
[default=$NOIOSCALING]" - set -x -} - -while getopts 'h-:' optchar; do - case "$optchar" in - -) - case "$OPTARG" in - help) - usage $0 - exit 0 - ;; - rw=*) RW="${OPTARG#*=}" ;; - rwmixread=*) MIX="${OPTARG#*=}" ;; - iodepth=*) IODEPTH="${OPTARG#*=}" ;; - block-size=*) BLK_SIZE="${OPTARG#*=}" ;; - run-time=*) RUNTIME="${OPTARG#*=}" ;; - ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;; - numjobs=*) NUMJOBS="${OPTARG#*=}" ;; - repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;; - fio-bin=*) FIO_BIN="${OPTARG#*=}" ;; - driver=*) PLUGIN="${OPTARG#*=}" ;; - disk-no=*) - DISKNO="${OPTARG#*=}" - ONEWORKLOAD=true - ;; - max-disk=*) DISKNO="${OPTARG#*=}" ;; - cpu-allowed=*) CPUS_ALLOWED="${OPTARG#*=}" ;; - no-preconditioning) PRECONDITIONING=false ;; - no-io-scaling) NOIOSCALING=true ;; - *) - usage $0 echo "Invalid argument '$OPTARG'" - exit 1 - ;; - esac - ;; - h) - usage $0 - exit 0 - ;; - *) - usage $0 "Invalid argument '$optchar'" - exit 1 - ;; - esac -done diff --git a/test/nvme/perf/config.fio.tmp b/test/nvme/perf/config.fio.tmp index 66f29faad52..dfaea5df59c 100644 --- a/test/nvme/perf/config.fio.tmp +++ b/test/nvme/perf/config.fio.tmp @@ -1,5 +1,6 @@ [global] -thread=1 -group_reporting=1 direct=1 +thread=1 norandommap=1 +group_reporting=1 +time_based=1 diff --git a/test/nvme/perf/run_perf.sh b/test/nvme/perf/run_perf.sh index 8e4954f5568..05692a2860d 100755 --- a/test/nvme/perf/run_perf.sh +++ b/test/nvme/perf/run_perf.sh @@ -1,42 +1,186 @@ #!/usr/bin/env bash +set -e -# Automated script that runs NVMe PMD/BDEV performance test. -# This script should be run as root. Please run the scripts/setup.sh before running this script to bind disks to VFIO/UIO driver -# This script takes the following parameters: -# "--run-time" - the run time for the workload in seconds -# "--ramp-time" - Fio will run the specified workload for this amount of time before logging any performance numbers -# "--cpu-allowed" - A comma-separated list of CPU cores used to run the workload. 
- When the spdk fio plugin is chosen, NVMe devices will -# be aligned to specific core according to their NUMA node. The script will try to align each core with devices matching core's -# on the same NUMA node first but if there are no devices left in the same NUMA node as the CPU Core then it will use devices on the other NUMA node. -# It is important to choose cores that will ensure best NUMA node allocation. For example, on a system with 8 devices on NUMA node -# 0 and 8 devices on NUMA node 1, cores 0-27 on numa node 0 and 28-55 on numa node 1, if test uses 16 disk and four cores -# then "--cpu-allowed=1,2,28,29" results in a NUMA-balanced configuration with 4 devices on each CPU core. -# However, if the test will use 10 CPU cores, then best option would be "--cpu-allowed=1,2,3,4,28,29,30,31,32,33" because cores 1-4 will be aligned with -# 2 devices on numa0 per core, cores 28-29 will be aligned with 2 devices on numa1 per core and cores 30-33 with 1 device on numa1 per core. -# "--iodepth" - Number of I/Os to keep in flight per devices for SPDK fio_plugin and per job for kernel driver. -# "--driver" - "This parameter is used to set the ioengine and other fio parameters that determine how fio jobs issue I/O. SPDK supports two modes (nvme and bdev): to use the SPDK BDEV fio plugin set the value to bdev, set the value to nvme to use the SPDK NVME PMD. -# "There are 4 modes available for Linux Kernel driver: set the value to kernel-libaio to use the Linux asynchronous I/O engine, -# set the value to kernel-classic-polling to use the pvsynch2 ioengine in classic polling mode (100% load on the polling CPU core), -# set the value to kernel-hybrid-polling to use the pvsynch2 ioengine in hybrid polling mode where the polling thread sleeps for half the mean device execution time, -# set the value to kernel-io-uring to use io_uring engine. -# "--no-preconditioning" - skip preconditioning - Normally the script will precondition disks to put them in a steady state. 
-# However, preconditioning could be skipped, for example preconditiong has been already made and workload was 100% reads. -# "--disk-no" - use specified number of disks for test. -# "--repeat-no" Repeat each workolad specified number of times. -# "--numjobs" - Number of fio threads running the workload. -# "--no-io-scaling" - Set number of iodepth to be per job instead per device for SPDK fio_plugin. -# An Example Performance Test Run -# "./spdk/test/perf/run_perf.sh --run-time=600 --ramp-time=60 --cpu-allowed=28 --fio-bin=/usr/src/fio/fio\ -# --rwmixread=100 --iodepth=256 --fio-plugin=bdev --no-preconditioning --disk-no=6" -# This command will run test using fio plugin for 600 seconds, 60 sec of ram time, randrw job with -# 100% reads with io depth 256 per disk, on 6 devices and skips preconditioning. Cpu core used for this test is -# core no 28. -BASE_DIR=$(readlink -f $(dirname $0)) -. $BASE_DIR/common.sh - -trap 'rm -f *.state $BASE_DIR/bdev.conf; print_backtrace' ERR SIGTERM SIGABRT -verify_disk_number +# Dir variables and sourcing common files +testdir=$(readlink -f $(dirname $0)) +rootdir=$(readlink -f $testdir/../../..) 
+plugin_dir=$rootdir/build/fio +bdevperf_dir=$rootdir/test/bdev/bdevperf +nvmeperf_dir=$rootdir/build/examples +source $testdir/common.sh +source $rootdir/scripts/common.sh || exit 1 +source $rootdir/test/common/autotest_common.sh + +# Global & default variables +declare -A KERNEL_ENGINES +KERNEL_ENGINES=( + ["kernel-libaio"]="--ioengine=libaio" + ["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100" + ["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100" + ["kernel-io-uring"]="--ioengine=io_uring") + +RW=randrw +MIX=100 +IODEPTH=256 +BLK_SIZE=4096 +RUNTIME=600 +RAMP_TIME=30 +NUMJOBS=1 +REPEAT_NO=3 +GTOD_REDUCE=false +SAMPLING_INT=0 +IO_BATCH_SUBMIT=0 +IO_BATCH_COMPLETE=0 +FIO_BIN=$CONFIG_FIO_SOURCE_DIR/fio +TMP_RESULT_FILE=$testdir/result.json +PLUGIN="nvme" +DISKCFG="" +BDEV_CACHE="" +BDEV_POOL="" +DISKNO="ALL" +CPUS_ALLOWED=1 +NOIOSCALING=false +PRECONDITIONING=true +CPUFREQ="" +PERFTOP=false +DPDKMEM=false +DATE="$(date +'%m_%d_%Y_%H%M%S')" + +function usage() { + set +x + [[ -n $2 ]] && ( + echo "$2" + echo "" + ) + echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration" + echo "Usage: $(basename $1) [options]" + echo "-h, --help Print help and exit" + echo + echo "Workload parameters:" + echo " --rw=STR Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]" + echo " --rwmixread=INT Percentage of a mixed workload that should be reads. [default=$MIX]" + echo " --iodepth=INT Number of I/Os to keep in flight against the file. [default=$IODEPTH]" + echo " --block-size=INT The block size in bytes used for I/O units. [default=$BLK_SIZE]" + echo " --run-time=TIME[s] Tell fio to run the workload for the specified period of time. [default=$RUNTIME]" + echo " --ramp-time=TIME[s] Fio will run the specified workload for this amount of time before" + echo " logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests." 
+ echo " --numjobs=INT Create the specified number of clones of this job. [default=$NUMJOBS]" + echo " Applicable only for fio-based tests." + echo " --repeat-no=INT How many times to repeat workload test. [default=$REPEAT_NO]" + echo " Test result will be an average of repeated test runs." + echo " --gtod-reduce Enable fio gtod_reduce option. [default=$GTOD_REDUCE]" + echo " --sampling-int=INT Value for fio log_avg_msec parameters [default=$SAMPLING_INT]" + echo " --io-batch-submit=INT Value for iodepth_batch_submit fio option [default=$IO_BATCH_SUBMIT]" + echo " --io-batch-complete=INT Value for iodepth_batch_complete fio option [default=$IO_BATCH_COMPLETE]" + echo " --fio-bin=PATH Path to fio binary. [default=$FIO_BIN]" + echo " Applicable only for fio-based tests." + echo + echo "Test setup parameters:" + echo " --driver=STR Selects tool used for testing. Choices available:" + echo " - spdk-perf-nvme (SPDK nvme perf)" + echo " - spdk-perf-bdev (SPDK bdev perf)" + echo " - spdk-plugin-nvme (SPDK nvme fio plugin)" + echo " - spdk-plugin-bdev (SPDK bdev fio plugin)" + echo " - kernel-classic-polling" + echo " - kernel-hybrid-polling" + echo " - kernel-libaio" + echo " - kernel-io-uring" + echo " --disk-config Configuration file containing PCI BDF addresses of NVMe disks to use in test." + echo " It consists a single column of PCI addresses. SPDK Bdev names will be assigned" + echo " and Kernel block device names detected." + echo " Lines starting with # are ignored as comments." + echo " --bdev-io-cache-size Set IO cache size for for SPDK bdev subsystem." + echo " --bdev-io-pool-size Set IO pool size for for SPDK bdev subsystem." + echo " --max-disk=INT,ALL Number of disks to test on, this will run multiple workloads with increasing number of disk each run." + echo " If =ALL then test on all found disk. [default=$DISKNO]" + echo " --cpu-allowed=INT/PATH Comma-separated list of CPU cores used to run the workload. Ranges allowed." 
+ echo " Can also point to a file containing list of CPUs. [default=$CPUS_ALLOWED]" + echo " --no-preconditioning Skip preconditioning" + echo " --no-io-scaling Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]" + echo " --cpu-frequency=INT Run tests with CPUs set to a desired frequency. 'intel_pstate=disable' must be set in" + echo " GRUB options. You can use 'cpupower frequency-info' and 'cpupower frequency-set' to" + echo " check list of available frequencies. Example: --cpu-frequency=1100000." + echo + echo "Other options:" + echo " --perftop Run perftop measurements on the same CPU cores as specified in --cpu-allowed option." + echo " --dpdk-mem-stats Dump DPDK memory stats during the test." + set -x +} + +while getopts 'h-:' optchar; do + case "$optchar" in + -) + case "$OPTARG" in + help) + usage $0 + exit 0 + ;; + rw=*) RW="${OPTARG#*=}" ;; + rwmixread=*) MIX="${OPTARG#*=}" ;; + iodepth=*) IODEPTH="${OPTARG#*=}" ;; + block-size=*) BLK_SIZE="${OPTARG#*=}" ;; + run-time=*) RUNTIME="${OPTARG#*=}" ;; + ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;; + numjobs=*) NUMJOBS="${OPTARG#*=}" ;; + repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;; + gtod-reduce) GTOD_REDUCE=true ;; + sampling-int=*) SAMPLING_INT="${OPTARG#*=}" ;; + io-batch-submit=*) IO_BATCH_SUBMIT="${OPTARG#*=}" ;; + io-batch-complete=*) IO_BATCH_COMPLETE="${OPTARG#*=}" ;; + fio-bin=*) FIO_BIN="${OPTARG#*=}" ;; + driver=*) PLUGIN="${OPTARG#*=}" ;; + disk-config=*) + DISKCFG="${OPTARG#*=}" + if [[ ! -f "$DISKCFG" ]]; then + echo "Disk confiuration file $DISKCFG does not exist!" 
+ exit 1 + fi + ;; + bdev-io-cache-size=*) BDEV_CACHE="${OPTARG#*=}" ;; + bdev-io-pool-size=*) BDEV_POOL="${OPTARG#*=}" ;; + max-disk=*) DISKNO="${OPTARG#*=}" ;; + cpu-allowed=*) + CPUS_ALLOWED="${OPTARG#*=}" + if [[ -f "$CPUS_ALLOWED" ]]; then + CPUS_ALLOWED=$(cat "$CPUS_ALLOWED") + fi + ;; + no-preconditioning) PRECONDITIONING=false ;; + no-io-scaling) NOIOSCALING=true ;; + cpu-frequency=*) CPUFREQ="${OPTARG#*=}" ;; + perftop) PERFTOP=true ;; + dpdk-mem-stats) DPDKMEM=true ;; + *) + usage $0 echo "Invalid argument '$OPTARG'" + exit 1 + ;; + esac + ;; + h) + usage $0 + exit 0 + ;; + *) + usage $0 "Invalid argument '$optchar'" + exit 1 + ;; + esac +done + +result_dir=$testdir/results/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE} +result_file=$result_dir/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.csv +mkdir -p $result_dir +unset iops_disks bw mean_lat_disks_usec p90_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec +echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix" > $result_file +printf "%s,%s,%s,%s,%s,%s,%s,%s\n" $RUNTIME $RAMP_TIME $PLUGIN $IODEPTH $BLK_SIZE $NO_CORES $RW $MIX >> $result_file +echo "num_of_disks,iops,avg_lat[usec],p90[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]" >> $result_file + +trap 'rm -f *.state $testdir/bdev.conf; kill $perf_pid; wait $dpdk_mem_pid; print_backtrace' ERR SIGTERM SIGABRT +if [[ "$PLUGIN" =~ "bdev" ]]; then + create_spdk_bdev_conf "$BDEV_CACHE" "$BDEV_POOL" +fi +verify_disk_number DISK_NAMES=$(get_disks $PLUGIN) DISKS_NUMA=$(get_numa_node $PLUGIN "$DISK_NAMES") CORES=$(get_cores "$CPUS_ALLOWED") @@ -48,7 +192,7 @@ if $PRECONDITIONING; then fi if [[ "$PLUGIN" =~ "kernel" ]]; then - $ROOT_DIR/scripts/setup.sh reset + $rootdir/scripts/setup.sh reset fio_ioengine_opt="${KERNEL_ENGINES[$PLUGIN]}" if [[ $PLUGIN = "kernel-classic-polling" ]]; then @@ -88,130 +232,140 @@ if [[ 
"$PLUGIN" =~ "kernel" ]]; then fi fi -result_dir=$BASE_DIR/results/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE} -result_file=$result_dir/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.csv -mkdir -p $result_dir -unset iops_disks bw mean_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec -echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix" > $result_file -printf "%s,%s,%s,%s,%s,%s,%s,%s\n" $RUNTIME $RAMP_TIME $PLUGIN $IODEPTH $BLK_SIZE $NO_CORES $RW $MIX >> $result_file -echo "num_of_disks,iops,avg_lat[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]" >> $result_file +if [[ -n "$CPUFREQ" ]]; then + if [[ ! "$(cat /proc/cmdline)" =~ "intel_pstate=disable" ]]; then + echo "ERROR: Cannot set custom CPU frequency for test. intel_pstate=disable not in boot options." + false + else + cpu_governor="$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)" + cpupower frequency-set -g userspace + cpupower frequency-set -f $CPUFREQ + fi +fi + +if $PERFTOP; then + echo "INFO: starting perf record on cores $CPUS_ALLOWED" + perf record -C $CPUS_ALLOWED -o "$testdir/perf.data" & + perf_pid=$! +fi + +if $DPDKMEM; then + echo "INFO: waiting to generate DPDK memory usage" + wait_time=$((RUNTIME / 2)) + if [[ ! "$PLUGIN" =~ "perf" ]]; then + wait_time=$((wait_time + RAMP_TIME)) + fi + ( + sleep $wait_time + echo "INFO: generating DPDK memory usage" + $rootdir/scripts/rpc.py env_dpdk_get_mem_stats + ) & + dpdk_mem_pid=$! +fi + +iops_disks=0 +bw=0 +min_lat_disks_usec=0 +max_lat_disks_usec=0 +mean_lat_disks_usec=0 +p90_lat_disks_usec=0 +p99_lat_disks_usec=0 +p99_99_lat_disks_usec=0 +stdev_disks_usec=0 +mean_slat_disks_usec=0 +mean_clat_disks_usec=0 #Run each workolad $REPEAT_NO times for ((j = 0; j < REPEAT_NO; j++)); do - #Start with $DISKNO disks and remove 2 disks for each run to avoid preconditioning before each run. 
- for ((k = DISKNO; k >= 1; k -= 2)); do - cp $BASE_DIR/config.fio.tmp $BASE_DIR/config.fio - echo "" >> $BASE_DIR/config.fio - #The SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread. - #Therefore, the per thread queue depth is set to the desired IODEPTH/device X the number of devices per thread. - if [[ "$PLUGIN" =~ "spdk-plugin" ]] && [[ "$NOIOSCALING" = false ]]; then - qd=$((IODEPTH * k)) - else - qd=$IODEPTH - fi + if [ $PLUGIN = "spdk-perf-bdev" ]; then + run_bdevperf > $TMP_RESULT_FILE + read -r iops bandwidth <<< $(get_bdevperf_results) + iops_disks=$(bc "$iops_disks + $iops") + bw=$(bc "$bw + $bandwidth") + cp $TMP_RESULT_FILE $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output + elif [ $PLUGIN = "spdk-perf-nvme" ]; then + run_nvmeperf $DISKNO > $TMP_RESULT_FILE + read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results) + + iops_disks=$(bc "$iops_disks+$iops") + bw=$(bc "$bw+$bandwidth") + mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat") + min_lat_disks_usec=$(bc "$min_lat_disks_usec + $min_lat") + max_lat_disks_usec=$(bc "$max_lat_disks_usec + $max_lat") - if [ $PLUGIN = "spdk-perf-bdev" ]; then - run_bdevperf > $NVME_FIO_RESULTS - iops_disks[$k]=$((${iops_disks[$k]} + $(get_bdevperf_results iops))) - bw[$k]=$((${bw[$k]} + $(get_bdevperf_results bw_Kibs))) - cp $NVME_FIO_RESULTS $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output - elif [ $PLUGIN = "spdk-perf-nvme" ]; then - run_nvmeperf $k > $NVME_FIO_RESULTS - read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results) - - iops_disks[$k]=$((${iops_disks[$k]} + iops)) - bw[$k]=$((${bw[$k]} + bandwidth)) - mean_lat_disks_usec[$k]=$((${mean_lat_disks_usec[$k]} + mean_lat)) - min_lat_disks_usec[$k]=$((${min_lat_disks_usec[$k]} + min_lat)) - max_lat_disks_usec[$k]=$((${max_lat_disks_usec[$k]} + max_lat)) - - cp $NVME_FIO_RESULTS 
$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output + cp $TMP_RESULT_FILE $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output + else + create_fio_config $DISKNO $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES" + + if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then + run_spdk_nvme_fio $PLUGIN "--output=$TMP_RESULT_FILE" \ + "--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${k}disks_${j}" else - create_fio_config $k $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES" - desc="Running Test: Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH} io_plugin/driver=$PLUGIN" - - cat <<- EOF >> $BASE_DIR/config.fio - rw=$RW - rwmixread=$MIX - iodepth=$qd - bs=$BLK_SIZE - runtime=$RUNTIME - ramp_time=$RAMP_TIME - numjobs=$NUMJOBS - time_based=1 - description=$desc - log_avg_msec=250 - EOF - - echo "USING CONFIG:" - cat $BASE_DIR/config.fio - - if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then - run_spdk_nvme_fio $PLUGIN "--output=$NVME_FIO_RESULTS" \ - "--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${k}disks_${j}" - else - run_nvme_fio $fio_ioengine_opt "--output=$NVME_FIO_RESULTS" \ - "--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${k}disks_${j}" - fi - - #Store values for every number of used disks - iops_disks[$k]=$((${iops_disks[$k]} + $(get_results iops $MIX))) - mean_lat_disks_usec[$k]=$((${mean_lat_disks_usec[$k]} + $(get_results mean_lat_usec $MIX))) - p99_lat_disks_usec[$k]=$((${p99_lat_disks_usec[$k]} + $(get_results p99_lat_usec $MIX))) - p99_99_lat_disks_usec[$k]=$((${p99_99_lat_disks_usec[$k]} + $(get_results p99_99_lat_usec $MIX))) - stdev_disks_usec[$k]=$((${stdev_disks_usec[$k]} + $(get_results stdev_usec $MIX))) - - mean_slat_disks_usec[$k]=$((${mean_slat_disks_usec[$k]} + $(get_results mean_slat_usec $MIX))) - 
mean_clat_disks_usec[$k]=$((${mean_clat_disks_usec[$k]} + $(get_results mean_clat_usec $MIX))) - bw[$k]=$((${bw[$k]} + $(get_results bw_Kibs $MIX))) - cp $NVME_FIO_RESULTS $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.json - cp $BASE_DIR/config.fio $result_dir/config_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.fio - rm -f $BASE_DIR/config.fio + run_nvme_fio $fio_ioengine_opt "--output=$TMP_RESULT_FILE" \ + "--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${k}disks_${j}" fi - #if tested on only one number of disk - if $ONEWORKLOAD; then - break + #Store values for every number of used disks + #Use recalculated value for mixread param in case rw mode is not rw. + rwmixread=$MIX + if [[ $RW = *"read"* ]]; then + rwmixread=100 + elif [[ $RW = *"write"* ]]; then + rwmixread=0 fi - done -done -#Write results to csv file -for ((k = DISKNO; k >= 1; k -= 2)); do - iops_disks[$k]=$((${iops_disks[$k]} / REPEAT_NO)) - - if [[ "$PLUGIN" =~ "plugin" ]]; then - mean_lat_disks_usec[$k]=$((${mean_lat_disks_usec[$k]} / REPEAT_NO)) - p99_lat_disks_usec[$k]=$((${p99_lat_disks_usec[$k]} / REPEAT_NO)) - p99_99_lat_disks_usec[$k]=$((${p99_99_lat_disks_usec[$k]} / REPEAT_NO)) - stdev_disks_usec[$k]=$((${stdev_disks_usec[$k]} / REPEAT_NO)) - mean_slat_disks_usec[$k]=$((${mean_slat_disks_usec[$k]} / REPEAT_NO)) - mean_clat_disks_usec[$k]=$((${mean_clat_disks_usec[$k]} / REPEAT_NO)) - elif [[ "$PLUGIN" == "spdk-perf-bdev" ]]; then - mean_lat_disks_usec[$k]=0 - p99_lat_disks_usec[$k]=0 - p99_99_lat_disks_usec[$k]=0 - stdev_disks_usec[$k]=0 - mean_slat_disks_usec[$k]=0 - mean_clat_disks_usec[$k]=0 - elif [[ "$PLUGIN" == "spdk-perf-nvme" ]]; then - mean_lat_disks_usec[$k]=$((${mean_lat_disks_usec[$k]} / REPEAT_NO)) - p99_lat_disks_usec[$k]=0 - p99_99_lat_disks_usec[$k]=0 - stdev_disks_usec[$k]=0 - mean_slat_disks_usec[$k]=0 - mean_clat_disks_usec[$k]=0 - fi - bw[$k]=$((${bw[$k]} / REPEAT_NO)) + 
read -r iops bandwidth mean_lat_usec p90_lat_usec p99_lat_usec p99_99_lat_usec \ + stdev_usec mean_slat_usec mean_clat_usec <<< $(get_results $rwmixread) + iops_disks=$(bc "$iops_disks + $iops") + mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat_usec") + p90_lat_disks_usec=$(bc "$p90_lat_disks_usec + $p90_lat_usec") + p99_lat_disks_usec=$(bc "$p99_lat_disks_usec + $p99_lat_usec") + p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec + $p99_99_lat_usec") + stdev_disks_usec=$(bc "$stdev_disks_usec + $stdev_usec") + mean_slat_disks_usec=$(bc "$mean_slat_disks_usec + $mean_slat_usec") + mean_clat_disks_usec=$(bc "$mean_clat_disks_usec + $mean_clat_usec") + bw=$(bc "$bw + $bandwidth") - printf "%s,%s,%s,%s,%s,%s,%s,%s,%s\n" ${k} ${iops_disks[$k]} ${mean_lat_disks_usec[$k]} ${p99_lat_disks_usec[$k]} \ - ${p99_99_lat_disks_usec[$k]} ${stdev_disks_usec[$k]} ${mean_slat_disks_usec[$k]} ${mean_clat_disks_usec[$k]} ${bw[$k]} >> $result_file - - #if tested on only one numeber of disk - if $ONEWORKLOAD; then - break + cp $TMP_RESULT_FILE $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.json + cp $testdir/config.fio $result_dir/config_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.fio + rm -f $testdir/config.fio fi done +if $PERFTOP; then + echo "INFO: Stopping perftop measurements." 
+ kill $perf_pid + wait $perf_pid || true + perf report -i "$testdir/perf.data" > $result_dir/perftop_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt + rm -f "$testdir/perf.data" +fi + +if $DPDKMEM; then + mv "/tmp/spdk_mem_dump.txt" $result_dir/spdk_mem_dump_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt + echo "INFO: DPDK memory usage saved in $result_dir" +fi + +#Write results to csv file +iops_disks=$(bc "$iops_disks / $REPEAT_NO") +bw=$(bc "$bw / $REPEAT_NO") +if [[ "$PLUGIN" =~ "plugin" ]] || [[ "$PLUGIN" =~ "kernel" ]]; then + mean_lat_disks_usec=$(bc "$mean_lat_disks_usec / $REPEAT_NO") + p90_lat_disks_usec=$(bc "$p90_lat_disks_usec / $REPEAT_NO") + p99_lat_disks_usec=$(bc "$p99_lat_disks_usec / $REPEAT_NO") + p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec / $REPEAT_NO") + stdev_disks_usec=$(bc "$stdev_disks_usec / $REPEAT_NO") + mean_slat_disks_usec=$(bc "$mean_slat_disks_usec / $REPEAT_NO") + mean_clat_disks_usec=$(bc "$mean_clat_disks_usec / $REPEAT_NO") +elif [[ "$PLUGIN" == "spdk-perf-nvme" ]]; then + mean_lat_disks_usec=$(bc "$mean_lat_disks_usec/$REPEAT_NO") +fi + +printf "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" ${DISKNO} ${iops_disks} ${mean_lat_disks_usec} ${p90_lat_disks_usec} ${p99_lat_disks_usec} \ + ${p99_99_lat_disks_usec} ${stdev_disks_usec} ${mean_slat_disks_usec} ${mean_clat_disks_usec} ${bw} >> $result_file + +if [[ -n "$CPUFREQ" ]]; then + cpupower frequency-set -g $cpu_governor +fi + if [ $PLUGIN = "kernel-io-uring" ]; then # Reload the nvme driver so that other test runs are not affected modprobe -rv nvme @@ -227,4 +381,4 @@ if [ $PLUGIN = "kernel-io-uring" ]; then cat $backup_dir/$disk/io_poll_delay > $sysfs/io_poll_delay done fi -rm -f $BASE_DIR/bdev.conf $BASE_DIR/config.fio +rm -f $testdir/bdev.conf $testdir/config.fio diff --git a/test/nvme/spdk_nvme_cli.sh b/test/nvme/spdk_nvme_cli.sh index eea1a8a2058..516a16f486e 100755 --- a/test/nvme/spdk_nvme_cli.sh +++ b/test/nvme/spdk_nvme_cli.sh @@ -5,29 
+5,18 @@ rootdir=$(readlink -f $testdir/../..) source $rootdir/scripts/common.sh source $rootdir/test/common/autotest_common.sh -if [ -z "${DEPENDENCY_DIR}" ]; then - echo DEPENDENCY_DIR not defined! +if [[ $(uname) != "Linux" ]]; then + echo "NVMe cuse tests only supported on Linux" exit 1 fi -spdk_nvme_cli="${DEPENDENCY_DIR}/nvme-cli" +nvme_cli_build -if [ ! -d $spdk_nvme_cli ]; then - echo "nvme-cli repository not found at $spdk_nvme_cli; skipping tests." - exit 1 -fi - -if [ $(uname) = Linux ]; then - trap "kill_stub; exit 1" SIGINT SIGTERM EXIT - start_stub "-s 2048 -i 0 -m 0xF" -fi +trap "kill_stub; exit 1" SIGINT SIGTERM EXIT +start_stub "-s 2048 -i 0 -m 0xF" -# Build against the version of SPDK under test -rm -f "$spdk_nvme_cli/spdk" -ln -sf "$rootdir" "$spdk_nvme_cli/spdk" +pushd ${DEPENDENCY_DIR}/nvme-cli -cd $spdk_nvme_cli -make clean && make -j$(nproc) LDFLAGS="$(make -s -C $spdk_nvme_cli/spdk ldflags)" sed -i 's/spdk=0/spdk=1/g' spdk.conf sed -i 's/shm_id=.*/shm_id=0/g' spdk.conf for bdf in $(get_nvme_bdfs); do @@ -44,7 +33,8 @@ for bdf in $(get_nvme_bdfs); do ./nvme get-log $bdf -i 1 -l 100 ./nvme reset $bdf done -if [ $(uname) = Linux ]; then - trap - SIGINT SIGTERM EXIT - kill_stub -fi + +popd + +trap - SIGINT SIGTERM EXIT +kill_stub diff --git a/test/nvmf/common.sh b/test/nvmf/common.sh index a388093c325..79f9bc85097 100644 --- a/test/nvmf/common.sh +++ b/test/nvmf/common.sh @@ -1,4 +1,6 @@ NVMF_PORT=4420 +NVMF_SECOND_PORT=4421 +NVMF_THIRD_PORT=4422 NVMF_IP_PREFIX="192.168.100" NVMF_IP_LEAST_ADDR=8 NVMF_TCP_IP_ADDRESS="127.0.0.1" @@ -41,6 +43,7 @@ function load_ib_rdma_modules() { } function detect_soft_roce_nics() { + rxe_cfg stop # make sure we run tests with a clean slate rxe_cfg start } @@ -222,6 +225,9 @@ function revert_soft_roce() { } function check_ip_is_soft_roce() { + if [ "$TEST_TRANSPORT" != "rdma" ]; then + return 0 + fi rxe_cfg status rxe | grep -wq "$1" } diff --git a/test/nvmf/host/bdevperf.sh b/test/nvmf/host/bdevperf.sh index 
776550c4dc9..1a544a00213 100755 --- a/test/nvmf/host/bdevperf.sh +++ b/test/nvmf/host/bdevperf.sh @@ -21,13 +21,6 @@ function tgt_init() { } nvmftestinit -# There is an intermittent error relating to this test and Soft-RoCE. for now, just -# skip this test if we are using rxe. TODO: get to the bottom of GitHub issue #1165 -if [ $TEST_TRANSPORT == "rdma" ] && check_ip_is_soft_roce $NVMF_FIRST_TARGET_IP; then - echo "Using software RDMA, skipping the host bdevperf tests." - exit 0 -fi - tgt_init "$rootdir/test/bdev/bdevperf/bdevperf" --json <(gen_nvmf_target_json) -q 128 -o 4096 -w verify -t 1 diff --git a/test/nvmf/host/multicontroller.sh b/test/nvmf/host/multicontroller.sh new file mode 100755 index 00000000000..8d848ee902d --- /dev/null +++ b/test/nvmf/host/multicontroller.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +testdir=$(readlink -f $(dirname $0)) +rootdir=$(readlink -f $testdir/../../..) +source $rootdir/test/common/autotest_common.sh +source $rootdir/test/nvmf/common.sh + +rpc_py="$rootdir/scripts/rpc.py" + +MALLOC_BDEV_SIZE=64 +MALLOC_BLOCK_SIZE=512 +NVMF_HOST_FIRST_PORT="60000" +NVMF_HOST_SECOND_PORT="60001" + +bdevperf_rpc_sock=/var/tmp/bdevperf.sock + +if [ "$TEST_TRANSPORT" == "rdma" ]; then + echo "Skipping tests on RDMA because the rdma stack fails to configure the same IP for host and target." 
+ exit 0 +fi + +nvmftestinit + +nvmfappstart -m 0xF + +$rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192 +$rpc_py bdev_malloc_create $MALLOC_BDEV_SIZE $MALLOC_BLOCK_SIZE -b Malloc0 +$rpc_py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001 +$rpc_py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 Malloc0 +$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_PORT +$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_SECOND_PORT + +$rootdir/test/bdev/bdevperf/bdevperf -z -r $bdevperf_rpc_sock -q 128 -o 4096 -w write -t 1 -f &> $testdir/try.txt & +bdevperf_pid=$! + +trap 'process_shm --id $NVMF_APP_SHM_ID; pap "$testdir/try.txt"; killprocess $bdevperf_pid; nvmftestfini; exit 1' SIGINT SIGTERM EXIT +waitforlisten $bdevperf_pid $bdevperf_rpc_sock + +# Create a controller from the first IP/Port combination. +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP \ + -s $NVMF_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 -i $NVMF_FIRST_TARGET_IP -c $NVMF_HOST_FIRST_PORT + +# wait for the first controller to show up. +while ! $rpc_py -s $bdevperf_rpc_sock bdev_nvme_get_controllers | grep -c NVMe; do + ((++bdev_nvme_get_controllers_timeout <= 10)) + sleep 1s +done + +# try to attach to the second port with a different hostsvcid (this should fail). +NOT $rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP \ + -s $NVMF_SECOND_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 -i $NVMF_FIRST_TARGET_IP -c $NVMF_HOST_SECOND_PORT + +# Add a second path without specifying the host information. Should pass. 
+$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP \ + -s $NVMF_SECOND_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 + +# Add a second controller by attaching to the same subsystem from a different hostid. +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP \ + -s $NVMF_SECOND_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 -i $NVMF_FIRST_TARGET_IP -c $NVMF_HOST_SECOND_PORT + +if [ "$($rpc_py -s $bdevperf_rpc_sock bdev_nvme_get_controllers | grep -c NVMe)" != "2" ]; then + echo "actual number of controllers is not equal to expected count." + exit 1 +fi + +$rootdir/test/bdev/bdevperf/bdevperf.py -s $bdevperf_rpc_sock perform_tests + +killprocess $bdevperf_pid + +$rpc_py nvmf_delete_subsystem nqn.2016-06.io.spdk:cnode1 + +trap - SIGINT SIGTERM EXIT + +pap "$testdir/try.txt" +nvmftestfini diff --git a/test/nvmf/host/multipath.sh b/test/nvmf/host/multipath.sh new file mode 100755 index 00000000000..4acd9e599cb --- /dev/null +++ b/test/nvmf/host/multipath.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash + +testdir=$(readlink -f $(dirname $0)) +rootdir=$(readlink -f $testdir/../../..) +source $rootdir/test/common/autotest_common.sh +source $rootdir/test/nvmf/common.sh + +MALLOC_BDEV_SIZE=64 +MALLOC_BLOCK_SIZE=512 + +rpc_py="$rootdir/scripts/rpc.py" + +bdevperf_rpc_sock=/var/tmp/bdevperf.sock + +nvmftestinit + +# This issue brings up a weird error in soft roce where the RDMA WC doesn't point to the correct qpair. +if check_ip_is_soft_roce $NVMF_FIRST_TARGET_IP && [ "$TEST_TRANSPORT" == "rdma" ]; then + echo "Using software RDMA, not running this test due to a known issue." 
+ exit 0 +fi + +nvmfappstart -m 0xF + +$rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192 +$rpc_py bdev_malloc_create $MALLOC_BDEV_SIZE $MALLOC_BLOCK_SIZE -b Malloc0 +$rpc_py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001 +$rpc_py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 Malloc0 +$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_PORT +$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_SECOND_PORT +$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_THIRD_PORT + +$rootdir/test/bdev/bdevperf/bdevperf -z -r $bdevperf_rpc_sock -q 128 -o 4096 -w verify -t 10 -f &> $testdir/try.txt & +bdevperf_pid=$! + +trap 'process_shm --id $NVMF_APP_SHM_ID; rm -f $testdir/try.txt; killprocess $bdevperf_pid; nvmftestfini; exit 1' SIGINT SIGTERM EXIT +waitforlisten $bdevperf_pid $bdevperf_rpc_sock +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_SECOND_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 + +$rootdir/test/bdev/bdevperf/bdevperf.py -s $bdevperf_rpc_sock perform_tests & +rpc_pid=$! + +sleep 1 + +$rpc_py nvmf_subsystem_remove_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_PORT + +sleep 3 + +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_THIRD_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 +$rpc_py nvmf_subsystem_remove_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_SECOND_PORT + +sleep 3 + +# Give the admin qpair time to fail before we add the new listener in. 
This prevents us from trying to connect to the wrong trid. +$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_PORT + +sleep 1 + +$rpc_py nvmf_subsystem_remove_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_THIRD_PORT + +wait $rpc_pid + +killprocess $bdevperf_pid + +cat $testdir/try.txt +# if this test fails it means we didn't fail over to the second +count="$(grep -c "Resetting controller successful" < $testdir/try.txt)" + +if ((count != 3)); then + false +fi + +# Part 2 of the test. Start removing ports, starting with the one we are connected to, confirm that the ctrlr remains active until the final trid is removed. +$rootdir/test/bdev/bdevperf/bdevperf -z -r $bdevperf_rpc_sock -q 128 -o 4096 -w verify -t 1 -f &> $testdir/try.txt & +bdevperf_pid=$! + +waitforlisten $bdevperf_pid $bdevperf_rpc_sock +$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_SECOND_PORT +$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_THIRD_PORT +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_SECOND_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_THIRD_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 + +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_get_controllers | grep -q NVMe0 + +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_detach_controller NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 + +# Async operation since we need to reconnect with new TRID. 
+sleep 3 +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_get_controllers | grep -q NVMe0 +$rootdir/test/bdev/bdevperf/bdevperf.py -s $bdevperf_rpc_sock perform_tests & +rpc_pid=$! + +wait $rpc_pid + +# No need to wait here since we are deleting a TRID we aren't connected to. +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_detach_controller NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_THIRD_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_get_controllers | grep -q NVMe0 +$rpc_py -s $bdevperf_rpc_sock bdev_nvme_detach_controller NVMe0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_SECOND_PORT -f ipv4 -n nqn.2016-06.io.spdk:cnode1 +sleep 3 + +if $rpc_py -s $bdevperf_rpc_sock bdev_nvme_get_controllers | grep -q NVMe0; then + echo "Controller was not properly removed." + false +fi + +killprocess $bdevperf_pid + +sync +$rpc_py nvmf_delete_subsystem nqn.2016-06.io.spdk:cnode1 + +trap - SIGINT SIGTERM EXIT + +rm -f $testdir/try.txt +nvmftestfini diff --git a/test/nvmf/host/perf.sh b/test/nvmf/host/perf.sh index 69fa28f0b6e..4d70bd83dcc 100755 --- a/test/nvmf/host/perf.sh +++ b/test/nvmf/host/perf.sh @@ -32,7 +32,7 @@ $rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPOR # Test multi-process access to local NVMe device if [ -n "$local_nvme_trid" ]; then if [ $SPDK_RUN_NON_ROOT -eq 1 ]; then - perf_app="sudo -u $(logname) $SPDK_EXAMPLE_DIR/perf" + perf_app="sudo -u $USER $SPDK_EXAMPLE_DIR/perf" else perf_app="$SPDK_EXAMPLE_DIR/perf" fi diff --git a/test/nvmf/host/target_disconnect.sh b/test/nvmf/host/target_disconnect.sh index 82521196b3c..22c077afbe9 100755 --- a/test/nvmf/host/target_disconnect.sh +++ b/test/nvmf/host/target_disconnect.sh @@ -73,16 +73,10 @@ function nvmf_target_disconnect_tc3() { } nvmftestinit -# There is an intermittent error relating to this test and Soft-RoCE. for now, just -# skip this test if we are using rxe. 
TODO: get to the bottom of GitHub issue #1043 -if [ $TEST_TRANSPORT == "rdma" ] && check_ip_is_soft_roce $NVMF_FIRST_TARGET_IP; then - echo "Using software RDMA, skipping the target disconnect tests." -else - run_test "nvmf_target_disconnect_tc1" nvmf_target_disconnect_tc1 - run_test "nvmf_target_disconnect_tc2" nvmf_target_disconnect_tc2 - if [ -n "$NVMF_SECOND_TARGET_IP" ]; then - run_test "nvmf_target_disconnect_tc3" nvmf_target_disconnect_tc3 - fi +run_test "nvmf_target_disconnect_tc1" nvmf_target_disconnect_tc1 +run_test "nvmf_target_disconnect_tc2" nvmf_target_disconnect_tc2 +if [ -n "$NVMF_SECOND_TARGET_IP" ]; then + run_test "nvmf_target_disconnect_tc3" nvmf_target_disconnect_tc3 fi trap - SIGINT SIGTERM EXIT diff --git a/test/nvmf/nvmf.sh b/test/nvmf/nvmf.sh index dc15788fffc..188ded5a385 100755 --- a/test/nvmf/nvmf.sh +++ b/test/nvmf/nvmf.sh @@ -21,8 +21,6 @@ if [ $SPDK_TEST_NVME_CLI -eq 1 ]; then run_test "nvmf_nvme_cli" test/nvmf/target/nvme_cli.sh "${TEST_ARGS[@]}" fi run_test "nvmf_lvol" test/nvmf/target/nvmf_lvol.sh "${TEST_ARGS[@]}" -#TODO: disabled due to intermittent failures. Need to triage. -# run_test "nvmf_srq_overwhelm" test/nvmf/target/srq_overwhelm.sh $TEST_ARGS run_test "nvmf_vhost" test/nvmf/target/nvmf_vhost.sh "${TEST_ARGS[@]}" run_test "nvmf_bdev_io_wait" test/nvmf/target/bdev_io_wait.sh "${TEST_ARGS[@]}" run_test "nvmf_create_transport." test/nvmf/target/create_transport.sh "${TEST_ARGS[@]}" @@ -37,21 +35,38 @@ fi run_test "nvmf_nmic" test/nvmf/target/nmic.sh "${TEST_ARGS[@]}" run_test "nvmf_rpc" test/nvmf/target/rpc.sh "${TEST_ARGS[@]}" run_test "nvmf_fio" test/nvmf/target/fio.sh "${TEST_ARGS[@]}" -run_test "nvmf_shutdown" test/nvmf/target/shutdown.sh "${TEST_ARGS[@]}" run_test "nvmf_bdevio" test/nvmf/target/bdevio.sh "${TEST_ARGS[@]}" run_test "nvmf_invalid" test/nvmf/target/invalid.sh "${TEST_ARGS[@]}" +run_test "nvmf_abort" test/nvmf/target/abort.sh "${TEST_ARGS[@]}" + +if ! 
check_ip_is_soft_roce $NVMF_FIRST_TARGET_IP; then + # Soft-RoCE will return invalid values in the WC field after a qp has been + # destroyed which lead to NULL pointer references not seen in real hardware. + run_test "nvmf_shutdown" test/nvmf/target/shutdown.sh "${TEST_ARGS[@]}" + #TODO: disabled due to intermittent failures. Need to triage. + # run_test "nvmf_srq_overwhelm" test/nvmf/target/srq_overwhelm.sh $TEST_ARGS +fi timing_enter host -run_test "nvmf_bdevperf" test/nvmf/host/bdevperf.sh "${TEST_ARGS[@]}" run_test "nvmf_identify" test/nvmf/host/identify.sh "${TEST_ARGS[@]}" run_test "nvmf_perf" test/nvmf/host/perf.sh "${TEST_ARGS[@]}" +run_test "nvmf_multipath" test/nvmf/host/multipath.sh "${TEST_ARGS[@]}" +run_test "nvmf_multicontroller" test/nvmf/host/multicontroller.sh "${TEST_ARGS[@]}" # TODO: disabled due to intermittent failures (RDMA_CM_EVENT_UNREACHABLE/ETIMEDOUT) #run_test test/nvmf/host/identify_kernel_nvmf.sh $TEST_ARGS run_test "nvmf_aer" test/nvmf/host/aer.sh "${TEST_ARGS[@]}" run_test "nvmf_fio" test/nvmf/host/fio.sh "${TEST_ARGS[@]}" -run_test "nvmf_target_disconnect" test/nvmf/host/target_disconnect.sh "${TEST_ARGS[@]}" + +# There is an intermittent error relating to those tests and Soft-RoCE. +# Skip those tests if we are using rxe. +if ! check_ip_is_soft_roce $NVMF_FIRST_TARGET_IP; then + # GitHub issue #1165 + run_test "nvmf_bdevperf" test/nvmf/host/bdevperf.sh "${TEST_ARGS[@]}" + # GitHub issue #1043 + run_test "nvmf_target_disconnect" test/nvmf/host/target_disconnect.sh "${TEST_ARGS[@]}" +fi timing_exit host diff --git a/test/nvmf/target/abort.sh b/test/nvmf/target/abort.sh new file mode 100755 index 00000000000..913c17e19d4 --- /dev/null +++ b/test/nvmf/target/abort.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +testdir=$(readlink -f $(dirname $0)) +rootdir=$(readlink -f $testdir/../../..) 
+source $rootdir/test/common/autotest_common.sh +source $rootdir/test/nvmf/common.sh + +MALLOC_BDEV_SIZE=64 +MALLOC_BLOCK_SIZE=4096 + +rpc_py="$rootdir/scripts/rpc.py" + +nvmftestinit +nvmfappstart -m 0xE + +$rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192 + +# Construct a delay bdev on a malloc bdev which has constant 10ms delay for all read or write I/Os +$rpc_py bdev_malloc_create $MALLOC_BDEV_SIZE $MALLOC_BLOCK_SIZE -b Malloc0 +$rpc_py bdev_delay_create -b Malloc0 -d Delay0 -r 1000000 -t 1000000 -w 1000000 -n 1000000 + +# Create an NVMe-oF subsystem and add the delay bdev as a namespace +$rpc_py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode0 -a -s SPDK0 +$rpc_py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode0 Delay0 +$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode0 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s $NVMF_PORT + +# Run abort application +$SPDK_EXAMPLE_DIR/abort -r "trtype:$TEST_TRANSPORT adrfam:IPv4 traddr:$NVMF_FIRST_TARGET_IP trsvcid:$NVMF_PORT" -c 0x1 + +# Clean up +$rpc_py nvmf_delete_subsystem nqn.2016-06.io.spdk:cnode0 + +trap - SIGINT SIGTERM EXIT + +nvmftestfini diff --git a/test/nvmf/target/nmic.sh b/test/nvmf/target/nmic.sh index 6a967dc081e..f8501343d28 100755 --- a/test/nvmf/target/nmic.sh +++ b/test/nvmf/target/nmic.sh @@ -13,8 +13,6 @@ rpc_py="$rootdir/scripts/rpc.py" nvmftestinit nvmfappstart -m 0xF -NVMF_SECOND_TARGET_IP=$(echo "$RDMA_IP_LIST" | sed -n 2p) - $rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192 # Create subsystems @@ -38,18 +36,15 @@ else fi echo "test case2: host connect to nvmf target in multiple paths" -if [ -n "$NVMF_SECOND_TARGET_IP" ]; then - $rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_SECOND_TARGET_IP -s $NVMF_PORT - - nvme connect -t $TEST_TRANSPORT -n "nqn.2016-06.io.spdk:cnode1" -a "$NVMF_FIRST_TARGET_IP" -s "$NVMF_PORT" - nvme connect -t $TEST_TRANSPORT -n "nqn.2016-06.io.spdk:cnode1" -a "$NVMF_SECOND_TARGET_IP" -s "$NVMF_PORT" 
+$rpc_py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s "$NVMF_SECOND_PORT" +nvme connect -t $TEST_TRANSPORT -n "nqn.2016-06.io.spdk:cnode1" -a "$NVMF_FIRST_TARGET_IP" -s "$NVMF_PORT" +nvme connect -t $TEST_TRANSPORT -n "nqn.2016-06.io.spdk:cnode1" -a "$NVMF_FIRST_TARGET_IP" -s "$NVMF_SECOND_PORT" - waitforserial "$NVMF_SERIAL" +waitforserial "$NVMF_SERIAL" - $rootdir/scripts/fio.py -p nvmf -i 4096 -d 1 -t write -r 1 -v -fi +$rootdir/scripts/fio.py -p nvmf -i 4096 -d 1 -t write -r 1 -v -nvme disconnect -n "nqn.2016-06.io.spdk:cnode1" || true +nvme disconnect -n "nqn.2016-06.io.spdk:cnode1" trap - SIGINT SIGTERM EXIT diff --git a/test/nvmf/target/nvme_cli.sh b/test/nvmf/target/nvme_cli.sh index 7cef321790c..29359689bae 100755 --- a/test/nvmf/target/nvme_cli.sh +++ b/test/nvmf/target/nvme_cli.sh @@ -10,8 +10,6 @@ if [ -z "${DEPENDENCY_DIR}" ]; then exit 1 fi -spdk_nvme_cli="${DEPENDENCY_DIR}/nvme-cli" - MALLOC_BDEV_SIZE=64 MALLOC_BLOCK_SIZE=512 @@ -54,21 +52,22 @@ done nvme disconnect -n "nqn.2016-06.io.spdk:cnode1" -if [ -d $spdk_nvme_cli ]; then - # Test spdk/nvme-cli NVMe-oF commands: discover, connect and disconnect - cd $spdk_nvme_cli - sed -i 's/shm_id=.*/shm_id=-1/g' spdk.conf - ./nvme discover -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s "$NVMF_PORT" - nvme_num_before_connection=$(get_nvme_devs 2>&1 || echo 0) - ./nvme connect -t $TEST_TRANSPORT -n "nqn.2016-06.io.spdk:cnode1" -a "$NVMF_FIRST_TARGET_IP" -s "$NVMF_PORT" - sleep 1 - nvme_num=$(get_nvme_devs 2>&1) - ./nvme disconnect -n "nqn.2016-06.io.spdk:cnode1" - if [ $nvme_num -le $nvme_num_before_connection ]; then - echo "spdk/nvme-cli connect target devices failed" - exit 1 - fi +# Test spdk/nvme-cli NVMe-oF commands: discover, connect and disconnect +nvme_cli_build +pushd "${DEPENDENCY_DIR}/nvme-cli" + +sed -i 's/shm_id=.*/shm_id=-1/g' spdk.conf +./nvme discover -t $TEST_TRANSPORT -a $NVMF_FIRST_TARGET_IP -s "$NVMF_PORT" 
+nvme_num_before_connection=$(get_nvme_devs 2>&1 || echo 0) +./nvme connect -t $TEST_TRANSPORT -n "nqn.2016-06.io.spdk:cnode1" -a "$NVMF_FIRST_TARGET_IP" -s "$NVMF_PORT" +sleep 1 +nvme_num=$(get_nvme_devs 2>&1) +./nvme disconnect -n "nqn.2016-06.io.spdk:cnode1" +if [ $nvme_num -le $nvme_num_before_connection ]; then + echo "spdk/nvme-cli connect target devices failed" + exit 1 fi +popd $rpc_py nvmf_delete_subsystem nqn.2016-06.io.spdk:cnode1 trap - SIGINT SIGTERM EXIT diff --git a/test/nvmf/target/nvmf_example.sh b/test/nvmf/target/nvmf_example.sh index 28045bc49f3..e1ab28f6247 100755 --- a/test/nvmf/target/nvmf_example.sh +++ b/test/nvmf/target/nvmf_example.sh @@ -7,22 +7,25 @@ source $rootdir/test/nvmf/common.sh rpc_py="$rootdir/scripts/rpc.py" +NVMF_EXAMPLE=("$SPDK_EXAMPLE_DIR/nvmf") + MALLOC_BDEV_SIZE=64 MALLOC_BLOCK_SIZE=512 function build_nvmf_example_args() { if [ $SPDK_RUN_NON_ROOT -eq 1 ]; then - echo "sudo -u $(logname) $SPDK_EXAMPLE_DIR/nvmf -i $NVMF_APP_SHM_ID" -g 10000 + NVMF_EXAMPLE=(sudo -u "$USER" "${NVMF_EXAMPLE[@]}") + NVMF_EXAMPLE+=(-i "$NVMF_APP_SHM_ID" -g 10000) else - echo "$SPDK_EXAMPLE_DIR/nvmf -i $NVMF_APP_SHM_ID" -g 10000 + NVMF_EXAMPLE+=(-i "$NVMF_APP_SHM_ID" -g 10000) fi } -NVMF_EXAMPLE="$(build_nvmf_example_args)" +build_nvmf_example_args function nvmfexamplestart() { timing_enter start_nvmf_example - $NVMF_EXAMPLE $1 & + "${NVMF_EXAMPLE[@]}" $1 & nvmfpid=$! trap 'process_shm --id $NVMF_APP_SHM_ID; nvmftestfini; exit 1' SIGINT SIGTERM EXIT waitforlisten $nvmfpid diff --git a/test/nvmf/target/srq_overwhelm.sh b/test/nvmf/target/srq_overwhelm.sh index fe4dd7d2909..98af97aab3e 100755 --- a/test/nvmf/target/srq_overwhelm.sh +++ b/test/nvmf/target/srq_overwhelm.sh @@ -12,11 +12,6 @@ rpc_py="$rootdir/scripts/rpc.py" nvmftestinit -if check_ip_is_soft_roce $NVMF_FIRST_TARGET_IP; then - echo "Using software RDMA, Likely not enough memory to run this test. aborting." 
- exit 0 -fi - nvmfappstart -m 0xF # create the rdma transport with an intentionally small SRQ depth diff --git a/test/ocf/common.sh b/test/ocf/common.sh index 6c196ab9705..89709079b6f 100644 --- a/test/ocf/common.sh +++ b/test/ocf/common.sh @@ -15,7 +15,6 @@ function clear_nvme() { # Clear metadata on NVMe device $rootdir/scripts/setup.sh reset - sleep 5 name=$(get_nvme_name_from_bdf "${bdf[0]}") mountpoints=$(lsblk /dev/$name --output MOUNTPOINT -n | wc -w) diff --git a/test/ocf/management/create-destruct.sh b/test/ocf/management/create-destruct.sh index 162f7a67926..c1fd66b360c 100755 --- a/test/ocf/management/create-destruct.sh +++ b/test/ocf/management/create-destruct.sh @@ -43,7 +43,7 @@ if bdev_check_claimed Malloc0; then exit 1 fi -$rpc_py bdev_ocf_create FullCache wt Malloc0 Malloc1 +$rpc_py bdev_ocf_create FullCache wt Malloc0 Malloc1 --cache-line-size 8 $rpc_py bdev_ocf_get_bdevs FullCache | jq -e \ '.[0] | .started and .cache.attached and .core.attached' @@ -59,7 +59,7 @@ if bdev_check_claimed Malloc0 && bdev_check_claimed Malloc1; then exit 1 fi -$rpc_py bdev_ocf_create HotCache wt Malloc0 Malloc1 +$rpc_py bdev_ocf_create HotCache wt Malloc0 Malloc1 --cache-line-size 16 if ! 
(bdev_check_claimed Malloc0 && bdev_check_claimed Malloc1); then echo >&2 "Base devices expected to be claimed now" diff --git a/test/spdkcli/match_files/spdkcli_details_lvs.test.match b/test/spdkcli/match_files/spdkcli_details_lvs.test.match index acbb23b1af0..efb34b9e793 100644 --- a/test/spdkcli/match_files/spdkcli_details_lvs.test.match +++ b/test/spdkcli/match_files/spdkcli_details_lvs.test.match @@ -1,6 +1,6 @@ { "base_bdev": "Malloc0", - "block_size": 4096, + "block_size": 512, "cluster_size": 4194304, "free_clusters": 1, "name": "lvs0", diff --git a/test/spdkcli/match_files/spdkcli_iscsi.test.match b/test/spdkcli/match_files/spdkcli_iscsi.test.match index ffb31492128..daa1ec8b998 100644 --- a/test/spdkcli/match_files/spdkcli_iscsi.test.match +++ b/test/spdkcli/match_files/spdkcli_iscsi.test.match @@ -16,7 +16,9 @@ o- iscsi ....................................................................... | o- first_burst_length: 8192 .............................................................................................. [...] | o- immediate_data: True .................................................................................................. [...] | o- max_connections_per_session: 2 ........................................................................................ [...] + | o- max_large_datain_per_connection: 64 ................................................................................... [...] | o- max_queue_depth: 64 ................................................................................................... [...] + | o- max_r2t_per_connection: 4 ............................................................................................. [...] | o- max_sessions: 128 ..................................................................................................... [...] | o- mutual_chap: False .................................................................................................... [...] 
| o- node_base: iqn.2016-06.io.spdk ........................................................................................ [...] diff --git a/test/unit/lib/bdev/bdev_ocssd.c/bdev_ocssd_ut.c b/test/unit/lib/bdev/bdev_ocssd.c/bdev_ocssd_ut.c index d039a3d98f7..59a99aa0814 100644 --- a/test/unit/lib/bdev/bdev_ocssd.c/bdev_ocssd_ut.c +++ b/test/unit/lib/bdev/bdev_ocssd.c/bdev_ocssd_ut.c @@ -198,6 +198,7 @@ create_nvme_bdev_controller(const struct spdk_nvme_transport_id *trid, const cha { struct spdk_nvme_ctrlr *ctrlr; struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct nvme_bdev_ctrlr_trid *trid_entry; uint32_t nsid; ctrlr = find_controller(trid); @@ -211,12 +212,15 @@ create_nvme_bdev_controller(const struct spdk_nvme_transport_id *trid, const cha nvme_bdev_ctrlr->namespaces = calloc(ctrlr->ns_count, sizeof(struct nvme_bdev_ns *)); SPDK_CU_ASSERT_FATAL(nvme_bdev_ctrlr->namespaces != NULL); + trid_entry = calloc(1, sizeof(struct nvme_bdev_ctrlr_trid)); + SPDK_CU_ASSERT_FATAL(trid_entry != NULL); + trid_entry->trid = *trid; + nvme_bdev_ctrlr->ctrlr = ctrlr; nvme_bdev_ctrlr->num_ns = ctrlr->ns_count; nvme_bdev_ctrlr->ref = 0; - nvme_bdev_ctrlr->trid = *trid; + nvme_bdev_ctrlr->connected_trid = &trid_entry->trid; nvme_bdev_ctrlr->name = strdup(name); - for (nsid = 0; nsid < ctrlr->ns_count; ++nsid) { nvme_bdev_ctrlr->namespaces[nsid] = calloc(1, sizeof(struct nvme_bdev_ns)); SPDK_CU_ASSERT_FATAL(nvme_bdev_ctrlr->namespaces[nsid] != NULL); @@ -236,6 +240,9 @@ create_nvme_bdev_controller(const struct spdk_nvme_transport_id *trid, const cha TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); + TAILQ_INIT(&nvme_bdev_ctrlr->trids); + TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link); + return nvme_bdev_ctrlr; } diff --git a/test/unit/lib/bdev/compress.c/compress_ut.c b/test/unit/lib/bdev/compress.c/compress_ut.c index c0abb28083f..53c14310ca7 100644 --- a/test/unit/lib/bdev/compress.c/compress_ut.c +++ b/test/unit/lib/bdev/compress.c/compress_ut.c @@ 
-923,7 +923,7 @@ test_poller(void) CU_ASSERT(TAILQ_EMPTY(&g_comp_bdev.queued_comp_ops) == true); rc = comp_dev_poller((void *)&g_comp_bdev); CU_ASSERT(TAILQ_EMPTY(&g_comp_bdev.queued_comp_ops) == true); - CU_ASSERT(rc == 0); + CU_ASSERT(rc == SPDK_POLLER_BUSY); /* Success from dequeue, 2 ops. nothing needing to be resubmitted. */ @@ -942,7 +942,7 @@ test_poller(void) CU_ASSERT(TAILQ_EMPTY(&g_comp_bdev.queued_comp_ops) == true); rc = comp_dev_poller((void *)&g_comp_bdev); CU_ASSERT(TAILQ_EMPTY(&g_comp_bdev.queued_comp_ops) == true); - CU_ASSERT(rc == 0); + CU_ASSERT(rc == SPDK_POLLER_BUSY); /* Success from dequeue, one op to be resubmitted. */ @@ -970,7 +970,7 @@ test_poller(void) CU_ASSERT(TAILQ_EMPTY(&g_comp_bdev.queued_comp_ops) == false); rc = comp_dev_poller((void *)&g_comp_bdev); CU_ASSERT(TAILQ_EMPTY(&g_comp_bdev.queued_comp_ops) == true); - CU_ASSERT(rc == 0); + CU_ASSERT(rc == SPDK_POLLER_BUSY); /* op_to_queue is freed in code under test */ free(cb_args); diff --git a/test/unit/lib/blob/Makefile b/test/unit/lib/blob/Makefile index 019f966d0a8..a039a423eb2 100644 --- a/test/unit/lib/blob/Makefile +++ b/test/unit/lib/blob/Makefile @@ -34,7 +34,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) 
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk -CUNIT_VERSION = $(shell sed -n -e 's/.*VERSION "\([0-9\.\-]*\).*/\1/p' /usr/include/CUnit/CUnit.h) +CUNIT_VERSION = $(shell echo "\#include " | $(CC) -E -dM - | sed -n -e 's/.*VERSION "\([0-9\.\-]*\).*/\1/p') ifeq ($(CUNIT_VERSION),2.1-3) DIRS-y = blob.c else diff --git a/test/unit/lib/blob/blob.c/blob_ut.c b/test/unit/lib/blob/blob.c/blob_ut.c index 6e51842e359..21a39543c41 100644 --- a/test/unit/lib/blob/blob.c/blob_ut.c +++ b/test/unit/lib/blob/blob.c/blob_ut.c @@ -47,8 +47,8 @@ struct spdk_blob_store *g_bs; spdk_blob_id g_blobid; -struct spdk_blob *g_blob; -int g_bserrno; +struct spdk_blob *g_blob, *g_blob2; +int g_bserrno, g_bserrno2; struct spdk_xattr_names *g_names; int g_done; char *g_xattr_names[] = {"first", "second", "third"}; @@ -170,6 +170,18 @@ blob_op_with_handle_complete(void *cb_arg, struct spdk_blob *blb, int bserrno) g_bserrno = bserrno; } +static void +blob_op_with_handle_complete2(void *cb_arg, struct spdk_blob *blob, int bserrno) +{ + if (g_blob == NULL) { + g_blob = blob; + g_bserrno = bserrno; + } else { + g_blob2 = blob; + g_bserrno2 = bserrno; + } +} + static void ut_bs_reload(struct spdk_blob_store **bs, struct spdk_bs_opts *opts) { @@ -322,8 +334,32 @@ blob_open(void) CU_ASSERT(g_bserrno == 0); CU_ASSERT(g_blob != NULL); blob = g_blob; + spdk_blob_close(blob, blob_op_complete, NULL); + poll_threads(); + CU_ASSERT(g_bserrno == 0); - ut_blob_close_and_delete(bs, blob); + /* Try to open file twice in succession. This should return the same + * blob object. 
+ */ + g_blob = NULL; + g_blob2 = NULL; + g_bserrno = -1; + g_bserrno2 = -1; + spdk_bs_open_blob(bs, blobid, blob_op_with_handle_complete2, NULL); + spdk_bs_open_blob(bs, blobid, blob_op_with_handle_complete2, NULL); + poll_threads(); + CU_ASSERT(g_bserrno == 0); + CU_ASSERT(g_bserrno2 == 0); + CU_ASSERT(g_blob != NULL); + CU_ASSERT(g_blob2 != NULL); + CU_ASSERT(g_blob == g_blob2); + + g_bserrno = -1; + spdk_blob_close(g_blob, blob_op_complete, NULL); + poll_threads(); + CU_ASSERT(g_bserrno == 0); + + ut_blob_close_and_delete(bs, g_blob); } static void @@ -407,6 +443,36 @@ blob_create(void) CU_ASSERT(g_bserrno == -ENOSPC); } +/* + * Create and delete one blob in a loop over and over again. This helps ensure + * that the internal bit masks tracking used clusters and md_pages are being + * tracked correctly. + */ +static void +blob_create_loop(void) +{ + struct spdk_blob_store *bs = g_bs; + struct spdk_blob_opts opts; + uint32_t i, loop_count; + + loop_count = 4 * spdk_max(spdk_bit_array_capacity(bs->used_md_pages), + spdk_bit_pool_capacity(bs->used_clusters)); + + for (i = 0; i < loop_count; i++) { + ut_spdk_blob_opts_init(&opts); + opts.num_clusters = 1; + g_bserrno = -1; + g_blobid = SPDK_BLOBID_INVALID; + spdk_bs_create_blob_ext(bs, &opts, blob_op_with_id_complete, NULL); + poll_threads(); + CU_ASSERT(g_bserrno == 0); + CU_ASSERT(g_blobid != SPDK_BLOBID_INVALID); + spdk_bs_delete_blob(bs, g_blobid, blob_op_complete, NULL); + poll_threads(); + CU_ASSERT(g_bserrno == 0); + } +} + static void blob_create_fail(void) { @@ -5351,8 +5417,8 @@ blob_delete_snapshot_power_failure(void) CU_ASSERT(g_bserrno == 0); CU_ASSERT(g_blobid != SPDK_BLOBID_INVALID); snapshotid = g_blobid; - SPDK_CU_ASSERT_FATAL(spdk_bit_array_get(bs->used_clusters, 1)); - SPDK_CU_ASSERT_FATAL(!spdk_bit_array_get(bs->used_clusters, 11)); + SPDK_CU_ASSERT_FATAL(spdk_bit_pool_is_allocated(bs->used_clusters, 1)); + SPDK_CU_ASSERT_FATAL(!spdk_bit_pool_is_allocated(bs->used_clusters, 11)); 
dev_set_power_failure_thresholds(thresholds); @@ -5365,8 +5431,8 @@ blob_delete_snapshot_power_failure(void) dev_reset_power_failure_event(); ut_bs_dirty_load(&bs, NULL); - SPDK_CU_ASSERT_FATAL(spdk_bit_array_get(bs->used_clusters, 1)); - SPDK_CU_ASSERT_FATAL(!spdk_bit_array_get(bs->used_clusters, 11)); + SPDK_CU_ASSERT_FATAL(spdk_bit_pool_is_allocated(bs->used_clusters, 1)); + SPDK_CU_ASSERT_FATAL(!spdk_bit_pool_is_allocated(bs->used_clusters, 11)); spdk_bs_open_blob(bs, blobid, blob_op_with_handle_complete, NULL); poll_threads(); @@ -5452,8 +5518,8 @@ blob_create_snapshot_power_failure(void) CU_ASSERT(g_bserrno == 0); CU_ASSERT(g_blobid != SPDK_BLOBID_INVALID); blobid = g_blobid; - SPDK_CU_ASSERT_FATAL(spdk_bit_array_get(bs->used_clusters, 1)); - SPDK_CU_ASSERT_FATAL(!spdk_bit_array_get(bs->used_clusters, 11)); + SPDK_CU_ASSERT_FATAL(spdk_bit_pool_is_allocated(bs->used_clusters, 1)); + SPDK_CU_ASSERT_FATAL(!spdk_bit_pool_is_allocated(bs->used_clusters, 11)); dev_set_power_failure_thresholds(thresholds); @@ -5462,16 +5528,16 @@ blob_create_snapshot_power_failure(void) poll_threads(); create_snapshot_bserrno = g_bserrno; snapshotid = g_blobid; - SPDK_CU_ASSERT_FATAL(spdk_bit_array_get(bs->used_clusters, 1)); - SPDK_CU_ASSERT_FATAL(!spdk_bit_array_get(bs->used_clusters, 11)); + SPDK_CU_ASSERT_FATAL(spdk_bit_pool_is_allocated(bs->used_clusters, 1)); + SPDK_CU_ASSERT_FATAL(!spdk_bit_pool_is_allocated(bs->used_clusters, 11)); /* Do not shut down cleanly. Assumption is that after create snapshot * reports success, both blobs should be power-fail safe. 
*/ dev_reset_power_failure_event(); ut_bs_dirty_load(&bs, NULL); - SPDK_CU_ASSERT_FATAL(spdk_bit_array_get(bs->used_clusters, 1)); - SPDK_CU_ASSERT_FATAL(!spdk_bit_array_get(bs->used_clusters, 11)); + SPDK_CU_ASSERT_FATAL(spdk_bit_pool_is_allocated(bs->used_clusters, 1)); + SPDK_CU_ASSERT_FATAL(!spdk_bit_pool_is_allocated(bs->used_clusters, 11)); spdk_bs_open_blob(bs, blobid, blob_op_with_handle_complete, NULL); poll_threads(); @@ -6612,6 +6678,7 @@ int main(int argc, char **argv) CU_ADD_TEST(suite, blob_init); CU_ADD_TEST(suite_bs, blob_open); CU_ADD_TEST(suite_bs, blob_create); + CU_ADD_TEST(suite_bs, blob_create_loop); CU_ADD_TEST(suite_bs, blob_create_fail); CU_ADD_TEST(suite_bs, blob_create_internal); CU_ADD_TEST(suite, blob_thin_provision); diff --git a/test/unit/lib/iscsi/common.c b/test/unit/lib/iscsi/common.c index e6631848ab2..11698320f4f 100644 --- a/test/unit/lib/iscsi/common.c +++ b/test/unit/lib/iscsi/common.c @@ -160,7 +160,7 @@ DEFINE_STUB(spdk_scsi_dev_delete_port, int, DEFINE_STUB_V(shutdown_iscsi_conns, (void)); -DEFINE_STUB_V(iscsi_conns_request_logout, (struct spdk_iscsi_tgt_node *target)); +DEFINE_STUB_V(iscsi_conns_request_logout, (struct spdk_iscsi_tgt_node *target, int pg_tag)); DEFINE_STUB(iscsi_get_active_conns, int, (struct spdk_iscsi_tgt_node *target), 0); diff --git a/test/unit/lib/iscsi/conn.c/conn_ut.c b/test/unit/lib/iscsi/conn.c/conn_ut.c index dc016847d98..e77593b377f 100644 --- a/test/unit/lib/iscsi/conn.c/conn_ut.c +++ b/test/unit/lib/iscsi/conn.c/conn_ut.c @@ -56,7 +56,10 @@ struct spdk_scsi_lun { uint8_t reserved; }; -struct spdk_iscsi_globals g_iscsi; +struct spdk_iscsi_globals g_iscsi = { + .MaxLargeDataInPerConnection = DEFAULT_MAX_LARGE_DATAIN_PER_CONNECTION, +}; + static TAILQ_HEAD(read_tasks_head, spdk_iscsi_task) g_ut_read_tasks = TAILQ_HEAD_INITIALIZER(g_ut_read_tasks); static struct spdk_iscsi_task *g_new_task = NULL; @@ -209,8 +212,22 @@ DEFINE_STUB_V(iscsi_task_mgmt_response, DEFINE_STUB_V(iscsi_send_nopin, (struct 
spdk_iscsi_conn *conn)); -DEFINE_STUB(iscsi_del_transfer_task, bool, - (struct spdk_iscsi_conn *conn, uint32_t task_tag), true); +bool +iscsi_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t task_tag) +{ + struct spdk_iscsi_task *task; + + task = TAILQ_FIRST(&conn->active_r2t_tasks); + if (task == NULL || task->tag != task_tag) { + return false; + } + + TAILQ_REMOVE(&conn->active_r2t_tasks, task, link); + task->is_r2t_active = false; + iscsi_task_put(task); + + return true; +} DEFINE_STUB(iscsi_handle_incoming_pdus, int, (struct spdk_iscsi_conn *conn), 0); @@ -424,6 +441,7 @@ process_non_read_task_completion_test(void) primary.scsi.ref = 1; TAILQ_INSERT_TAIL(&conn.active_r2t_tasks, &primary, link); primary.is_r2t_active = true; + primary.tag = 1; /* First subtask which failed. */ task.scsi.length = 4096; @@ -555,7 +573,7 @@ free_tasks_on_connection(void) TAILQ_INIT(&conn.write_pdu_list); TAILQ_INIT(&conn.snack_pdu_list); TAILQ_INIT(&conn.queued_datain_tasks); - conn.data_in_cnt = MAX_LARGE_DATAIN_PER_CONNECTION; + conn.data_in_cnt = g_iscsi.MaxLargeDataInPerConnection; pdu1.task = &task1; pdu2.task = &task2; @@ -709,7 +727,7 @@ abort_queued_datain_task_test(void) TAILQ_INSERT_TAIL(&conn.queued_datain_tasks, &task, link); /* No slots for sub read tasks */ - conn.data_in_cnt = MAX_LARGE_DATAIN_PER_CONNECTION; + conn.data_in_cnt = g_iscsi.MaxLargeDataInPerConnection; rc = _iscsi_conn_abort_queued_datain_task(&conn, &task); CU_ASSERT(rc != 0); CU_ASSERT(!TAILQ_EMPTY(&conn.queued_datain_tasks)); @@ -732,7 +750,7 @@ abort_queued_datain_task_test(void) TAILQ_INSERT_TAIL(&conn.queued_datain_tasks, &task, link); /* No slots for sub read tasks */ - conn.data_in_cnt = MAX_LARGE_DATAIN_PER_CONNECTION; + conn.data_in_cnt = g_iscsi.MaxLargeDataInPerConnection; rc = _iscsi_conn_abort_queued_datain_task(&conn, &task); CU_ASSERT(rc != 0); CU_ASSERT(!TAILQ_EMPTY(&conn.queued_datain_tasks)); diff --git a/test/unit/lib/iscsi/iscsi.c/iscsi_ut.c 
b/test/unit/lib/iscsi/iscsi.c/iscsi_ut.c index 0d70485d73a..a102ceda56b 100644 --- a/test/unit/lib/iscsi/iscsi.c/iscsi_ut.c +++ b/test/unit/lib/iscsi/iscsi.c/iscsi_ut.c @@ -79,8 +79,13 @@ iscsi_tgt_node_access(struct spdk_iscsi_conn *conn, } } +DEFINE_STUB(iscsi_tgt_node_is_redirected, bool, + (struct spdk_iscsi_conn *conn, struct spdk_iscsi_tgt_node *target, + char *buf, int buf_len), + false); + DEFINE_STUB(iscsi_send_tgts, int, - (struct spdk_iscsi_conn *conn, const char *iiqn, const char *iaddr, + (struct spdk_iscsi_conn *conn, const char *iiqn, const char *tiqn, uint8_t *data, int alloc_len, int data_len), 0); @@ -275,6 +280,8 @@ maxburstlength_test(void) struct spdk_iscsi_pdu *response_pdu; int rc; + g_iscsi.MaxR2TPerConnection = DEFAULT_MAXR2T; + req_pdu = iscsi_get_pdu(&conn); data_out_pdu = iscsi_get_pdu(&conn); @@ -649,6 +656,8 @@ add_transfer_task_test(void) int rc, count = 0; uint32_t buffer_offset, desired_xfer_len; + g_iscsi.MaxR2TPerConnection = DEFAULT_MAXR2T; + sess.MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; /* 1M */ sess.MaxOutstandingR2T = DEFAULT_MAXR2T; /* 4 */ @@ -687,7 +696,6 @@ add_transfer_task_test(void) CU_ASSERT(conn.data_out_cnt == 255); CU_ASSERT(conn.pending_r2t == 1); - CU_ASSERT(conn.outstanding_r2t_tasks[0] == &task); CU_ASSERT(conn.ttt == 1); CU_ASSERT(task.data_out_cnt == 255); @@ -779,7 +787,7 @@ del_transfer_task_test(void) { struct spdk_iscsi_sess sess = {}; struct spdk_iscsi_conn conn = {}; - struct spdk_iscsi_task task1 = {}, task2 = {}, task3 = {}, task4 = {}, task5 = {}, *task; + struct spdk_iscsi_task *task1, *task2, *task3, *task4, *task5; struct spdk_iscsi_pdu *pdu1, *pdu2, *pdu3, *pdu4, *pdu5, *pdu; int rc; @@ -794,83 +802,100 @@ del_transfer_task_test(void) SPDK_CU_ASSERT_FATAL(pdu1 != NULL); pdu1->data_segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - task1.scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - iscsi_task_set_pdu(&task1, pdu1); - task1.tag = 11; - rc = add_transfer_task(&conn, 
&task1); + task1 = iscsi_task_get(&conn, NULL, NULL); + SPDK_CU_ASSERT_FATAL(task1 != NULL); + + task1->scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; + iscsi_task_set_pdu(task1, pdu1); + task1->tag = 11; + + rc = add_transfer_task(&conn, task1); CU_ASSERT(rc == 0); pdu2 = iscsi_get_pdu(&conn); SPDK_CU_ASSERT_FATAL(pdu2 != NULL); pdu2->data_segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - task2.scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - iscsi_task_set_pdu(&task2, pdu2); - task2.tag = 12; - rc = add_transfer_task(&conn, &task2); + task2 = iscsi_task_get(&conn, NULL, NULL); + SPDK_CU_ASSERT_FATAL(task2 != NULL); + + task2->scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; + iscsi_task_set_pdu(task2, pdu2); + task2->tag = 12; + + rc = add_transfer_task(&conn, task2); CU_ASSERT(rc == 0); pdu3 = iscsi_get_pdu(&conn); SPDK_CU_ASSERT_FATAL(pdu3 != NULL); pdu3->data_segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - task3.scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - iscsi_task_set_pdu(&task3, pdu3); - task3.tag = 13; - rc = add_transfer_task(&conn, &task3); + task3 = iscsi_task_get(&conn, NULL, NULL); + SPDK_CU_ASSERT_FATAL(task3 != NULL); + + task3->scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; + iscsi_task_set_pdu(task3, pdu3); + task3->tag = 13; + + rc = add_transfer_task(&conn, task3); CU_ASSERT(rc == 0); pdu4 = iscsi_get_pdu(&conn); SPDK_CU_ASSERT_FATAL(pdu4 != NULL); pdu4->data_segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - task4.scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - iscsi_task_set_pdu(&task4, pdu4); - task4.tag = 14; - rc = add_transfer_task(&conn, &task4); + task4 = iscsi_task_get(&conn, NULL, NULL); + SPDK_CU_ASSERT_FATAL(task4 != NULL); + + task4->scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; + iscsi_task_set_pdu(task4, pdu4); + task4->tag = 14; + + rc = add_transfer_task(&conn, task4); CU_ASSERT(rc == 0); pdu5 = 
iscsi_get_pdu(&conn); SPDK_CU_ASSERT_FATAL(pdu5 != NULL); pdu5->data_segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - task5.scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; - iscsi_task_set_pdu(&task5, pdu5); - task5.tag = 15; - rc = add_transfer_task(&conn, &task5); + task5 = iscsi_task_get(&conn, NULL, NULL); + SPDK_CU_ASSERT_FATAL(task5 != NULL); + + task5->scsi.transfer_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; + iscsi_task_set_pdu(task5, pdu5); + task5->tag = 15; + + rc = add_transfer_task(&conn, task5); CU_ASSERT(rc == 0); - CU_ASSERT(get_transfer_task(&conn, 1) == &task1); + CU_ASSERT(get_transfer_task(&conn, 1) == task1); CU_ASSERT(get_transfer_task(&conn, 5) == NULL); iscsi_del_transfer_task(&conn, 11); CU_ASSERT(get_transfer_task(&conn, 1) == NULL); - CU_ASSERT(get_transfer_task(&conn, 5) == &task5); + CU_ASSERT(get_transfer_task(&conn, 5) == task5); - CU_ASSERT(get_transfer_task(&conn, 2) == &task2); + CU_ASSERT(get_transfer_task(&conn, 2) == task2); iscsi_del_transfer_task(&conn, 12); CU_ASSERT(get_transfer_task(&conn, 2) == NULL); - CU_ASSERT(get_transfer_task(&conn, 3) == &task3); + CU_ASSERT(get_transfer_task(&conn, 3) == task3); iscsi_del_transfer_task(&conn, 13); CU_ASSERT(get_transfer_task(&conn, 3) == NULL); - CU_ASSERT(get_transfer_task(&conn, 4) == &task4); + CU_ASSERT(get_transfer_task(&conn, 4) == task4); iscsi_del_transfer_task(&conn, 14); CU_ASSERT(get_transfer_task(&conn, 4) == NULL); - CU_ASSERT(get_transfer_task(&conn, 5) == &task5); + CU_ASSERT(get_transfer_task(&conn, 5) == task5); iscsi_del_transfer_task(&conn, 15); CU_ASSERT(get_transfer_task(&conn, 5) == NULL); - while (!TAILQ_EMPTY(&conn.active_r2t_tasks)) { - task = TAILQ_FIRST(&conn.active_r2t_tasks); - TAILQ_REMOVE(&conn.active_r2t_tasks, task, link); - } + CU_ASSERT(TAILQ_EMPTY(&conn.active_r2t_tasks)); while (!TAILQ_EMPTY(&g_write_pdu_list)) { pdu = TAILQ_FIRST(&g_write_pdu_list); @@ -1854,6 +1879,7 @@ pdu_hdr_op_data_test(void) conn.sess = &sess; 
conn.dev = &dev; + TAILQ_INIT(&conn.active_r2t_tasks); /* Case 1 - SCSI Data-Out PDU is acceptable only on normal session. */ sess.session_type = SESSION_TYPE_DISCOVERY; @@ -1881,7 +1907,7 @@ pdu_hdr_op_data_test(void) */ primary.desired_data_transfer_length = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH - 1; conn.pending_r2t = 1; - conn.outstanding_r2t_tasks[0] = &primary; + TAILQ_INSERT_TAIL(&conn.active_r2t_tasks, &primary, link); rc = iscsi_pdu_hdr_op_data(&conn, &pdu); CU_ASSERT(rc == SPDK_ISCSI_CONNECTION_FATAL); diff --git a/test/unit/lib/iscsi/portal_grp.c/portal_grp_ut.c b/test/unit/lib/iscsi/portal_grp.c/portal_grp_ut.c index a89a1567f48..bedde06b179 100644 --- a/test/unit/lib/iscsi/portal_grp.c/portal_grp_ut.c +++ b/test/unit/lib/iscsi/portal_grp.c/portal_grp_ut.c @@ -223,7 +223,7 @@ portal_grp_register_unregister_case(void) const char *host = "192.168.2.0"; const char *port = "3260"; - pg1 = iscsi_portal_grp_create(1); + pg1 = iscsi_portal_grp_create(1, false); CU_ASSERT(pg1 != NULL); p = iscsi_portal_create(host, port); @@ -254,7 +254,7 @@ portal_grp_register_twice_case(void) const char *host = "192.168.2.0"; const char *port = "3260"; - pg1 = iscsi_portal_grp_create(1); + pg1 = iscsi_portal_grp_create(1, false); CU_ASSERT(pg1 != NULL); p = iscsi_portal_create(host, port); @@ -294,7 +294,7 @@ portal_grp_add_delete_case(void) set_thread(0); /* internal of iscsi_create_portal_group */ - pg1 = iscsi_portal_grp_create(1); + pg1 = iscsi_portal_grp_create(1, false); CU_ASSERT(pg1 != NULL); p = iscsi_portal_create(host, port); @@ -340,7 +340,7 @@ portal_grp_add_delete_twice_case(void) set_thread(0); /* internal of iscsi_create_portal_group related */ - pg1 = iscsi_portal_grp_create(1); + pg1 = iscsi_portal_grp_create(1, false); CU_ASSERT(pg1 != NULL); p = iscsi_portal_create(host, port1); @@ -356,7 +356,7 @@ portal_grp_add_delete_twice_case(void) CU_ASSERT(rc == 0); /* internal of iscsi_create_portal_group related */ - pg2 = iscsi_portal_grp_create(2); + pg2 = 
iscsi_portal_grp_create(2, false); CU_ASSERT(pg2 != NULL); p = iscsi_portal_create(host, port2); diff --git a/test/unit/lib/nvme/nvme.c/nvme_ut.c b/test/unit/lib/nvme/nvme.c/nvme_ut.c index cf51a14bd99..482bf3b002b 100644 --- a/test/unit/lib/nvme/nvme.c/nvme_ut.c +++ b/test/unit/lib/nvme/nvme.c/nvme_ut.c @@ -1233,15 +1233,15 @@ test_nvme_request_check_timeout(void) } struct nvme_completion_poll_status g_status; -uint64_t completion_delay, timeout_in_secs; +uint64_t completion_delay_us, timeout_in_usecs; int g_process_comp_result; int spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) { - spdk_delay_us(completion_delay * spdk_get_ticks_hz()); + spdk_delay_us(completion_delay_us); - g_status.done = completion_delay < timeout_in_secs && g_process_comp_result == 0 ? true : false; + g_status.done = completion_delay_us < timeout_in_usecs && g_process_comp_result == 0 ? true : false; return g_process_comp_result; } @@ -1256,9 +1256,9 @@ test_nvme_wait_for_completion(void) /* completion timeout */ memset(&g_status, 0, sizeof(g_status)); - completion_delay = 2; - timeout_in_secs = 1; - rc = nvme_wait_for_completion_timeout(&qpair, &g_status, timeout_in_secs); + completion_delay_us = 2000000; + timeout_in_usecs = 1000000; + rc = nvme_wait_for_completion_timeout(&qpair, &g_status, timeout_in_usecs); CU_ASSERT(g_status.timed_out == true); CU_ASSERT(g_status.done == false); CU_ASSERT(rc == -ECANCELED); @@ -1266,9 +1266,9 @@ test_nvme_wait_for_completion(void) /* spdk_nvme_qpair_process_completions returns error */ memset(&g_status, 0, sizeof(g_status)); g_process_comp_result = -1; - completion_delay = 1; - timeout_in_secs = 2; - rc = nvme_wait_for_completion_timeout(&qpair, &g_status, timeout_in_secs); + completion_delay_us = 1000000; + timeout_in_usecs = 2000000; + rc = nvme_wait_for_completion_timeout(&qpair, &g_status, timeout_in_usecs); CU_ASSERT(rc == -ECANCELED); CU_ASSERT(g_status.timed_out == true); CU_ASSERT(g_status.done == 
false); @@ -1279,9 +1279,9 @@ test_nvme_wait_for_completion(void) /* complete in time */ memset(&g_status, 0, sizeof(g_status)); - completion_delay = 1; - timeout_in_secs = 2; - rc = nvme_wait_for_completion_timeout(&qpair, &g_status, timeout_in_secs); + completion_delay_us = 1000000; + timeout_in_usecs = 2000000; + rc = nvme_wait_for_completion_timeout(&qpair, &g_status, timeout_in_usecs); CU_ASSERT(g_status.timed_out == false); CU_ASSERT(g_status.done == true); CU_ASSERT(rc == 0); diff --git a/test/unit/lib/nvme/nvme_ctrlr.c/nvme_ctrlr_ut.c b/test/unit/lib/nvme/nvme_ctrlr.c/nvme_ctrlr_ut.c index 5e20fae457f..20bd5d4403e 100644 --- a/test/unit/lib/nvme/nvme_ctrlr.c/nvme_ctrlr_ut.c +++ b/test/unit/lib/nvme/nvme_ctrlr.c/nvme_ctrlr_ut.c @@ -318,7 +318,7 @@ nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, int nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair, struct nvme_completion_poll_status *status, - uint64_t timeout_in_secs) + uint64_t timeout_in_usecs) { return nvme_wait_for_completion_robust_lock(qpair, status, NULL); } @@ -1516,6 +1516,7 @@ test_nvme_ctrlr_construct_intel_support_log_page_list(void) CU_ASSERT(res == false); /* Set the vendor to Intel, but provide no device id */ + pci_id.class_id = SPDK_PCI_CLASS_NVME; ctrlr.cdata.vid = pci_id.vendor_id = SPDK_PCI_VID_INTEL; payload.temperature_statistics_log_len = 1; ctrlr.quirks = nvme_get_quirks(&pci_id); diff --git a/test/unit/lib/nvme/nvme_ns.c/nvme_ns_ut.c b/test/unit/lib/nvme/nvme_ns.c/nvme_ns_ut.c index 22c59e06c4f..a7f98ad1500 100644 --- a/test/unit/lib/nvme/nvme_ns.c/nvme_ns_ut.c +++ b/test/unit/lib/nvme/nvme_ns.c/nvme_ns_ut.c @@ -46,6 +46,9 @@ DEFINE_STUB(nvme_wait_for_completion_robust_lock, int, struct nvme_completion_poll_status *status, pthread_mutex_t *robust_mutex), 0); +DEFINE_STUB(spdk_nvme_ctrlr_is_active_ns, bool, + (struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid), 1); + int nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, uint8_t cns, uint16_t cntid, uint32_t nsid, 
void *payload, size_t payload_size, @@ -132,6 +135,49 @@ test_nvme_ns_uuid(void) CU_ASSERT(memcmp(uuid, &expected_uuid, sizeof(*uuid)) == 0); } +static void +test_nvme_ns_csi(void) +{ + struct spdk_nvme_ctrlr ctrlr = {}; + struct spdk_nvme_ns ns = { .ctrlr = &ctrlr }; + enum spdk_nvme_csi csi; + + /* Empty list - SPDK_NVME_CSI_NVM should be returned */ + memset(ns.id_desc_list, 0, sizeof(ns.id_desc_list)); + csi = spdk_nvme_ns_get_csi(&ns); + CU_ASSERT(csi == SPDK_NVME_CSI_NVM); + + /* NVM CSI - SPDK_NVME_CSI_NVM should be returned */ + memset(ns.id_desc_list, 0, sizeof(ns.id_desc_list)); + ns.id_desc_list[0] = 0x4; /* NIDT == CSI */ + ns.id_desc_list[1] = 0x1; /* NIDL */ + ns.id_desc_list[4] = 0x0; /* SPDK_NVME_CSI_NVM */ + csi = spdk_nvme_ns_get_csi(&ns); + CU_ASSERT(csi == SPDK_NVME_CSI_NVM); + + /* NGUID followed by ZNS CSI - SPDK_NVME_CSI_ZNS should be returned */ + memset(ns.id_desc_list, 0, sizeof(ns.id_desc_list)); + ns.id_desc_list[0] = 0x02; /* NIDT == NGUID */ + ns.id_desc_list[1] = 0x10; /* NIDL */ + memset(&ns.id_desc_list[4], 0xCC, 0x10); + ns.id_desc_list[20] = 0x4; /* NIDT == CSI */ + ns.id_desc_list[21] = 0x1; /* NIDL */ + ns.id_desc_list[24] = 0x2; /* SPDK_NVME_CSI_ZNS */ + csi = spdk_nvme_ns_get_csi(&ns); + CU_ASSERT(csi == SPDK_NVME_CSI_ZNS); + + /* KV CSI followed by NGUID - SPDK_NVME_CSI_KV should be returned */ + memset(ns.id_desc_list, 0, sizeof(ns.id_desc_list)); + ns.id_desc_list[0] = 0x4; /* NIDT == CSI */ + ns.id_desc_list[1] = 0x1; /* NIDL */ + ns.id_desc_list[4] = 0x1; /* SPDK_NVME_CSI_KV */ + ns.id_desc_list[5] = 0x02; /* NIDT == NGUID */ + ns.id_desc_list[6] = 0x10; /* NIDL */ + memset(&ns.id_desc_list[9], 0xCC, 0x10); + csi = spdk_nvme_ns_get_csi(&ns); + CU_ASSERT(csi == SPDK_NVME_CSI_KV); +} + int main(int argc, char **argv) { CU_pSuite suite = NULL; @@ -144,6 +190,7 @@ int main(int argc, char **argv) CU_ADD_TEST(suite, test_nvme_ns_construct); CU_ADD_TEST(suite, test_nvme_ns_uuid); + CU_ADD_TEST(suite, test_nvme_ns_csi); 
CU_basic_set_mode(CU_BRM_VERBOSE); CU_basic_run_tests(); diff --git a/test/unit/lib/nvme/nvme_qpair.c/nvme_qpair_ut.c b/test/unit/lib/nvme/nvme_qpair.c/nvme_qpair_ut.c index e34c7041334..02c4ed1708e 100644 --- a/test/unit/lib/nvme/nvme_qpair.c/nvme_qpair_ut.c +++ b/test/unit/lib/nvme/nvme_qpair.c/nvme_qpair_ut.c @@ -111,6 +111,7 @@ test3(void) struct nvme_request *req; struct spdk_nvme_ctrlr ctrlr = {}; + qpair.state = NVME_QPAIR_ENABLED; prepare_submit_request_test(&qpair, &ctrlr); req = nvme_allocate_request_null(&qpair, expected_success_callback, NULL); @@ -495,15 +496,10 @@ test_nvme_qpair_add_cmd_error_injection(void) cleanup_submit_request_test(&qpair); } -static void -test_nvme_qpair_submit_request(void) +static struct nvme_request * +allocate_request_tree(struct spdk_nvme_qpair *qpair) { - int rc; - struct spdk_nvme_qpair qpair = {}; - struct spdk_nvme_ctrlr ctrlr = {}; - struct nvme_request *req, *req1, *req2, *req3, *req2_1, *req2_2, *req2_3; - - prepare_submit_request_test(&qpair, &ctrlr); + struct nvme_request *req, *req1, *req2, *req3, *req2_1, *req2_2, *req2_3; /* * Build a request chain like the following: @@ -517,51 +513,71 @@ test_nvme_qpair_submit_request(void) * | | | * req2_1 req2_2 req2_3 */ - req = nvme_allocate_request_null(&qpair, NULL, NULL); + req = nvme_allocate_request_null(qpair, NULL, NULL); CU_ASSERT(req != NULL); TAILQ_INIT(&req->children); - req1 = nvme_allocate_request_null(&qpair, NULL, NULL); + req1 = nvme_allocate_request_null(qpair, NULL, NULL); CU_ASSERT(req1 != NULL); req->num_children++; TAILQ_INSERT_TAIL(&req->children, req1, child_tailq); req1->parent = req; - req2 = nvme_allocate_request_null(&qpair, NULL, NULL); + req2 = nvme_allocate_request_null(qpair, NULL, NULL); CU_ASSERT(req2 != NULL); TAILQ_INIT(&req2->children); req->num_children++; TAILQ_INSERT_TAIL(&req->children, req2, child_tailq); req2->parent = req; - req3 = nvme_allocate_request_null(&qpair, NULL, NULL); + req3 = nvme_allocate_request_null(qpair, NULL, 
NULL); CU_ASSERT(req3 != NULL); req->num_children++; TAILQ_INSERT_TAIL(&req->children, req3, child_tailq); req3->parent = req; - req2_1 = nvme_allocate_request_null(&qpair, NULL, NULL); + req2_1 = nvme_allocate_request_null(qpair, NULL, NULL); CU_ASSERT(req2_1 != NULL); req2->num_children++; TAILQ_INSERT_TAIL(&req2->children, req2_1, child_tailq); req2_1->parent = req2; - req2_2 = nvme_allocate_request_null(&qpair, NULL, NULL); + req2_2 = nvme_allocate_request_null(qpair, NULL, NULL); CU_ASSERT(req2_2 != NULL); req2->num_children++; TAILQ_INSERT_TAIL(&req2->children, req2_2, child_tailq); req2_2->parent = req2; - req2_3 = nvme_allocate_request_null(&qpair, NULL, NULL); + req2_3 = nvme_allocate_request_null(qpair, NULL, NULL); CU_ASSERT(req2_3 != NULL); req2->num_children++; TAILQ_INSERT_TAIL(&req2->children, req2_3, child_tailq); req2_3->parent = req2; + return req; +} + +static void +test_nvme_qpair_submit_request(void) +{ + int rc; + struct spdk_nvme_qpair qpair = {}; + struct spdk_nvme_ctrlr ctrlr = {}; + struct nvme_request *req; + + prepare_submit_request_test(&qpair, &ctrlr); + + req = allocate_request_tree(&qpair); ctrlr.is_failed = true; rc = nvme_qpair_submit_request(&qpair, req); SPDK_CU_ASSERT_FATAL(rc == -ENXIO); + req = allocate_request_tree(&qpair); + ctrlr.is_failed = false; + qpair.state = NVME_QPAIR_DISCONNECTING; + rc = nvme_qpair_submit_request(&qpair, req); + SPDK_CU_ASSERT_FATAL(rc == -ENXIO); + cleanup_submit_request_test(&qpair); } diff --git a/test/unit/lib/nvme/nvme_quirks.c/nvme_quirks_ut.c b/test/unit/lib/nvme/nvme_quirks.c/nvme_quirks_ut.c index 1054d6c2622..c3e799251bf 100644 --- a/test/unit/lib/nvme/nvme_quirks.c/nvme_quirks_ut.c +++ b/test/unit/lib/nvme/nvme_quirks.c/nvme_quirks_ut.c @@ -48,6 +48,7 @@ test_nvme_quirks_striping(void) CU_ASSERT((quirks & NVME_INTEL_QUIRK_STRIPING) == 0); /* Set the vendor id to Intel, but no device id. No striping. 
*/ + pci_id.class_id = SPDK_PCI_CLASS_NVME; pci_id.vendor_id = SPDK_PCI_VID_INTEL; quirks = nvme_get_quirks(&pci_id); CU_ASSERT((quirks & NVME_INTEL_QUIRK_STRIPING) == 0); diff --git a/test/unit/lib/nvme/nvme_rdma.c/nvme_rdma_ut.c b/test/unit/lib/nvme/nvme_rdma.c/nvme_rdma_ut.c index e36da30f20c..18e63a18971 100644 --- a/test/unit/lib/nvme/nvme_rdma.c/nvme_rdma_ut.c +++ b/test/unit/lib/nvme/nvme_rdma.c/nvme_rdma_ut.c @@ -126,6 +126,7 @@ test_nvme_rdma_build_sgl_request(void) ctrlr.max_sges = NVME_RDMA_MAX_SGL_DESCRIPTORS; ctrlr.cdata.nvmf_specific.msdbd = 16; + ctrlr.ioccsz_bytes = 4096; rqpair.mr_map = &rmap; rqpair.qpair.ctrlr = &ctrlr; @@ -193,7 +194,7 @@ test_nvme_rdma_build_sgl_request(void) SPDK_CU_ASSERT_FATAL(rc != 0); CU_ASSERT(bio.iovpos == 1); - /* Test case 4: Multiple SGL, SGL size smaller than I/O size */ + /* Test case 4: Multiple SGL, SGL size smaller than I/O size. Expected: FAIL */ bio.iovpos = 0; req.payload_offset = 0; req.payload_size = 0x6000; @@ -201,6 +202,198 @@ test_nvme_rdma_build_sgl_request(void) rc = nvme_rdma_build_sgl_request(&rqpair, &rdma_req); SPDK_CU_ASSERT_FATAL(rc != 0); CU_ASSERT(bio.iovpos == NVME_RDMA_MAX_SGL_DESCRIPTORS); + + /* Test case 5: SGL length exceeds 3 bytes. Expected: FAIL */ + req.payload_size = 0x1000 + (1 << 24); + bio.iovs[0].iov_len = 0x1000; + bio.iovs[1].iov_len = 1 << 24; + rc = nvme_rdma_build_sgl_request(&rqpair, &rdma_req); + SPDK_CU_ASSERT_FATAL(rc != 0); + + /* Test case 6: 4 SGL descriptors, size of SGL descriptors exceeds ICD. 
Expected: FAIL */ + ctrlr.ioccsz_bytes = 60; + bio.iovpos = 0; + req.payload_offset = 0; + req.payload_size = 0x4000; + for (i = 0; i < 4; i++) { + bio.iovs[i].iov_len = 0x1000; + } + rc = nvme_rdma_build_sgl_request(&rqpair, &rdma_req); + SPDK_CU_ASSERT_FATAL(rc == -1); +} + +static void +test_nvme_rdma_build_sgl_inline_request(void) +{ + struct nvme_rdma_qpair rqpair; + struct spdk_nvme_ctrlr ctrlr = {0}; + struct spdk_nvmf_cmd cmd = {{0}}; + struct spdk_nvme_rdma_req rdma_req = {0}; + struct nvme_request req = {{0}}; + struct nvme_rdma_ut_bdev_io bio; + struct spdk_nvme_rdma_mr_map rmap = {0}; + struct spdk_mem_map *map = NULL; + int rc; + + rmap.map = map; + + ctrlr.max_sges = NVME_RDMA_MAX_SGL_DESCRIPTORS; + ctrlr.cdata.nvmf_specific.msdbd = 16; + + rqpair.mr_map = &rmap; + rqpair.qpair.ctrlr = &ctrlr; + rqpair.cmds = &cmd; + cmd.sgl[0].address = 0x1111; + rdma_req.id = 0; + rdma_req.req = &req; + + req.payload.reset_sgl_fn = nvme_rdma_ut_reset_sgl; + req.payload.next_sge_fn = nvme_rdma_ut_next_sge; + req.payload.contig_or_cb_arg = &bio; + req.qpair = &rqpair.qpair; + + g_nvme_rdma_mr.lkey = 2; + + /* Test case 1: single inline SGL. 
Expected: PASS */ + bio.iovpos = 0; + req.payload_offset = 0; + req.payload_size = 0x1000; + bio.iovs[0].iov_base = (void *)0xdeadbeef; + bio.iovs[0].iov_len = 0x1000; + rc = nvme_rdma_build_sgl_inline_request(&rqpair, &rdma_req); + SPDK_CU_ASSERT_FATAL(rc == 0); + CU_ASSERT(bio.iovpos == 1); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.length == req.payload_size); + CU_ASSERT(req.cmd.dptr.sgl1.address == 0); + CU_ASSERT(rdma_req.send_sgl[0].length == sizeof(struct spdk_nvme_cmd)); + CU_ASSERT(rdma_req.send_sgl[1].length == req.payload_size); + CU_ASSERT(rdma_req.send_sgl[1].addr == (uint64_t)bio.iovs[0].iov_base); + CU_ASSERT(rdma_req.send_sgl[1].lkey == g_nvme_rdma_mr.lkey); + + /* Test case 2: SGL length exceeds 3 bytes. Expected: PASS */ + bio.iovpos = 0; + req.payload_offset = 0; + req.payload_size = 1 << 24; + bio.iovs[0].iov_len = 1 << 24; + rc = nvme_rdma_build_sgl_inline_request(&rqpair, &rdma_req); + SPDK_CU_ASSERT_FATAL(rc == 0); + CU_ASSERT(bio.iovpos == 1); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.length == req.payload_size); + CU_ASSERT(req.cmd.dptr.sgl1.address == 0); + CU_ASSERT(rdma_req.send_sgl[0].length == sizeof(struct spdk_nvme_cmd)); + CU_ASSERT(rdma_req.send_sgl[1].length == req.payload_size); + CU_ASSERT(rdma_req.send_sgl[1].addr == (uint64_t)bio.iovs[0].iov_base); + CU_ASSERT(rdma_req.send_sgl[1].lkey == g_nvme_rdma_mr.lkey); +} + +static void +test_nvme_rdma_build_contig_request(void) +{ + struct nvme_rdma_qpair rqpair; + struct spdk_nvme_ctrlr ctrlr = {0}; + struct spdk_nvmf_cmd cmd = {{0}}; + struct spdk_nvme_rdma_req rdma_req = {0}; + struct nvme_request req = {{0}}; + struct spdk_nvme_rdma_mr_map rmap = {0}; + struct spdk_mem_map *map 
= NULL; + int rc; + + rmap.map = map; + + ctrlr.max_sges = NVME_RDMA_MAX_SGL_DESCRIPTORS; + ctrlr.cdata.nvmf_specific.msdbd = 16; + + rqpair.mr_map = &rmap; + rqpair.qpair.ctrlr = &ctrlr; + rqpair.cmds = &cmd; + cmd.sgl[0].address = 0x1111; + rdma_req.id = 0; + rdma_req.req = &req; + + req.payload.contig_or_cb_arg = (void *)0xdeadbeef; + req.qpair = &rqpair.qpair; + + g_nvme_rdma_mr.rkey = 2; + + /* Test case 1: contig request. Expected: PASS */ + req.payload_offset = 0; + req.payload_size = 0x1000; + rc = nvme_rdma_build_contig_request(&rqpair, &rdma_req); + SPDK_CU_ASSERT_FATAL(rc == 0); + CU_ASSERT(req.cmd.dptr.sgl1.keyed.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK); + CU_ASSERT(req.cmd.dptr.sgl1.keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS); + CU_ASSERT(req.cmd.dptr.sgl1.keyed.length == req.payload_size); + CU_ASSERT(req.cmd.dptr.sgl1.keyed.key == g_nvme_rdma_mr.rkey); + CU_ASSERT(req.cmd.dptr.sgl1.address == (uint64_t)req.payload.contig_or_cb_arg); + CU_ASSERT(rdma_req.send_sgl[0].length == sizeof(struct spdk_nvme_cmd)); + + /* Test case 2: SGL length exceeds 3 bytes. 
Expected: FAIL */ + req.payload_offset = 0; + req.payload_size = 1 << 24; + rc = nvme_rdma_build_contig_request(&rqpair, &rdma_req); + SPDK_CU_ASSERT_FATAL(rc != 0); +} + +static void +test_nvme_rdma_build_contig_inline_request(void) +{ + struct nvme_rdma_qpair rqpair; + struct spdk_nvme_ctrlr ctrlr = {0}; + struct spdk_nvmf_cmd cmd = {{0}}; + struct spdk_nvme_rdma_req rdma_req = {0}; + struct nvme_request req = {{0}}; + struct spdk_nvme_rdma_mr_map rmap = {0}; + struct spdk_mem_map *map = NULL; + int rc; + + rmap.map = map; + + ctrlr.max_sges = NVME_RDMA_MAX_SGL_DESCRIPTORS; + ctrlr.cdata.nvmf_specific.msdbd = 16; + + rqpair.mr_map = &rmap; + rqpair.qpair.ctrlr = &ctrlr; + rqpair.cmds = &cmd; + cmd.sgl[0].address = 0x1111; + rdma_req.id = 0; + rdma_req.req = &req; + + req.payload.contig_or_cb_arg = (void *)0xdeadbeef; + req.qpair = &rqpair.qpair; + + g_nvme_rdma_mr.rkey = 2; + + /* Test case 1: single inline SGL. Expected: PASS */ + req.payload_offset = 0; + req.payload_size = 0x1000; + rc = nvme_rdma_build_contig_inline_request(&rqpair, &rdma_req); + SPDK_CU_ASSERT_FATAL(rc == 0); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.length == req.payload_size); + CU_ASSERT(req.cmd.dptr.sgl1.address == 0); + CU_ASSERT(rdma_req.send_sgl[0].length == sizeof(struct spdk_nvme_cmd)); + CU_ASSERT(rdma_req.send_sgl[1].length == req.payload_size); + CU_ASSERT(rdma_req.send_sgl[1].addr == (uint64_t)req.payload.contig_or_cb_arg); + CU_ASSERT(rdma_req.send_sgl[1].lkey == g_nvme_rdma_mr.lkey); + + /* Test case 2: SGL length exceeds 3 bytes. 
Expected: PASS */ + req.payload_offset = 0; + req.payload_size = 1 << 24; + rc = nvme_rdma_build_contig_inline_request(&rqpair, &rdma_req); + SPDK_CU_ASSERT_FATAL(rc == 0); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET); + CU_ASSERT(req.cmd.dptr.sgl1.unkeyed.length == req.payload_size); + CU_ASSERT(req.cmd.dptr.sgl1.address == 0); + CU_ASSERT(rdma_req.send_sgl[0].length == sizeof(struct spdk_nvme_cmd)); + CU_ASSERT(rdma_req.send_sgl[1].length == req.payload_size); + CU_ASSERT(rdma_req.send_sgl[1].addr == (uint64_t)req.payload.contig_or_cb_arg); + CU_ASSERT(rdma_req.send_sgl[1].lkey == g_nvme_rdma_mr.lkey); } int main(int argc, char **argv) @@ -213,6 +406,9 @@ int main(int argc, char **argv) suite = CU_add_suite("nvme_rdma", NULL, NULL); CU_ADD_TEST(suite, test_nvme_rdma_build_sgl_request); + CU_ADD_TEST(suite, test_nvme_rdma_build_sgl_inline_request); + CU_ADD_TEST(suite, test_nvme_rdma_build_contig_request); + CU_ADD_TEST(suite, test_nvme_rdma_build_contig_inline_request); CU_basic_set_mode(CU_BRM_VERBOSE); CU_basic_run_tests(); diff --git a/test/unit/lib/nvme/nvme_uevent.c/nvme_uevent_ut.c b/test/unit/lib/nvme/nvme_uevent.c/nvme_uevent_ut.c index a9775c98374..c5a94223d5b 100644 --- a/test/unit/lib/nvme/nvme_uevent.c/nvme_uevent_ut.c +++ b/test/unit/lib/nvme/nvme_uevent.c/nvme_uevent_ut.c @@ -47,8 +47,6 @@ enum uevent_parse_event_return_type { uevent_expected_continue = 1 }; -#define SPDK_NVME_UEVENT_SUBSYSTEM_NULL 0xFF - static void test_nvme_uevent_parse_event(void) { @@ -62,19 +60,19 @@ test_nvme_uevent_parse_event(void) /* Case 1: Add wrong non-uio or vfio-pci /devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0 */ commands = "ACTION=add\0DEVPATH=/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0\0SUBSYSTEM= \0DRIVER= \0PCI_SLOT_NAME= \0"; - uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_NULL; + uevent.subsystem = 0xFF; uevent.action = 0; rc = 
parse_event(commands, &uevent); - CU_ASSERT(rc == uevent_abnormal_exit); - CU_ASSERT(uevent.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_NULL); + CU_ASSERT(rc == uevent_expected_continue); + CU_ASSERT(uevent.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UNRECOGNIZED); CU_ASSERT(uevent.action == SPDK_NVME_UEVENT_ADD); /* Case 2: Add uio /devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0 */ commands = "ACTION=add \0DEVPATH=/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0\0SUBSYSTEM=uio\0DRIVER=\0PCI_SLOT_NAME= \0"; - uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_NULL; + uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UNRECOGNIZED; uevent.action = 0; rc = parse_event(commands, &uevent); @@ -86,7 +84,7 @@ test_nvme_uevent_parse_event(void) /* Case 3: Remove uio /devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0 */ commands = "ACTION=remove\0DEVPATH=/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0\0SUBSYSTEM=uio\0DRIVER=\0PCI_SLOT_NAME= \0"; - uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_NULL; + uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UNRECOGNIZED; rc = parse_event(commands, &uevent); @@ -96,7 +94,7 @@ test_nvme_uevent_parse_event(void) /* Case 4: Add vfio-pci 0000:81:00.0 */ commands = "ACTION=bind\0DEVPATH=\0SUBSYSTEM= \0DRIVER=vfio-pci\0PCI_SLOT_NAME=0000:81:00.0\0"; - uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_NULL; + uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UNRECOGNIZED; rc = parse_event(commands, &uevent); @@ -106,7 +104,7 @@ test_nvme_uevent_parse_event(void) /* Case 5: Remove vfio-pci 0000:81:00.0 */ commands = "ACTION=remove\0DEVPATH= \0SUBSYSTEM= \0DRIVER=vfio-pci \0PCI_SLOT_NAME=0000:81:00.0\0"; - uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_NULL; + uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UNRECOGNIZED; rc = parse_event(commands, &uevent); @@ -116,7 +114,7 @@ test_nvme_uevent_parse_event(void) /* Case 6: Add wrong vfio-pci addr 000000 */ commands = "ACTION=bind\0DEVPATH= \0SUBSYSTEM= \0DRIVER=vfio-pci \0PCI_SLOT_NAME=000000\0"; - 
uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_NULL; + uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UNRECOGNIZED; rc = parse_event(commands, &uevent); @@ -126,12 +124,12 @@ test_nvme_uevent_parse_event(void) /* Case 7: Add wrong type vfio 0000:81:00.0 */ commands = "ACTION=bind\0DEVPATH= \0SUBSYSTEM= \0DRIVER=vfio \0PCI_SLOT_NAME=0000:81:00.0\0"; - uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_NULL; + uevent.subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UIO; uevent.action = 0; rc = parse_event(commands, &uevent); - CU_ASSERT(rc == uevent_abnormal_exit); - CU_ASSERT(uevent.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_NULL); + CU_ASSERT(rc == uevent_expected_continue); + CU_ASSERT(uevent.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UNRECOGNIZED); CU_ASSERT(uevent.action == SPDK_NVME_UEVENT_ADD); } diff --git a/test/unit/lib/nvmf/ctrlr.c/ctrlr_ut.c b/test/unit/lib/nvmf/ctrlr.c/ctrlr_ut.c index e6ddff9bdad..4878174fa6a 100644 --- a/test/unit/lib/nvmf/ctrlr.c/ctrlr_ut.c +++ b/test/unit/lib/nvmf/ctrlr.c/ctrlr_ut.c @@ -119,6 +119,12 @@ DEFINE_STUB(spdk_nvmf_subsystem_listener_allowed, (struct spdk_nvmf_subsystem *subsystem, const struct spdk_nvme_transport_id *trid), true); +DEFINE_STUB(nvmf_subsystem_find_listener, + struct spdk_nvmf_subsystem_listener *, + (struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid), + (void *)0x1); + DEFINE_STUB(nvmf_bdev_ctrlr_read_cmd, int, (struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, @@ -179,6 +185,12 @@ DEFINE_STUB(nvmf_bdev_ctrlr_get_dif_ctx, bool, struct spdk_dif_ctx *dif_ctx), true); +DEFINE_STUB_V(nvmf_transport_qpair_abort_request, + (struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_request *req)); + +DEFINE_STUB_V(spdk_nvme_print_command, (uint16_t qid, struct spdk_nvme_cmd *cmd)); +DEFINE_STUB_V(spdk_nvme_print_completion, (uint16_t qid, struct spdk_nvme_cpl *cpl)); + int spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_cb cb_fn, void *ctx) { @@ -938,7 
+950,10 @@ test_set_get_features(void) { struct spdk_nvmf_subsystem subsystem = {}; struct spdk_nvmf_qpair admin_qpair = {}; - struct spdk_nvmf_ctrlr ctrlr = { .subsys = &subsystem, .admin_qpair = &admin_qpair }; + struct spdk_nvmf_subsystem_listener listener = {}; + struct spdk_nvmf_ctrlr ctrlr = { + .subsys = &subsystem, .admin_qpair = &admin_qpair, .listener = &listener + }; union nvmf_h2c_msg cmd = {}; union nvmf_c2h_msg rsp = {}; struct spdk_nvmf_ns ns[3]; @@ -948,6 +963,7 @@ test_set_get_features(void) subsystem.ns = ns_arr; subsystem.max_nsid = SPDK_COUNTOF(ns_arr); + listener.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; admin_qpair.ctrlr = &ctrlr; req.qpair = &admin_qpair; cmd.nvme_cmd.nsid = 1; @@ -1509,6 +1525,7 @@ test_fused_compare_and_write(void) struct spdk_nvmf_subsystem subsystem = {}; struct spdk_nvmf_ns ns = {}; struct spdk_nvmf_ns *subsys_ns[1] = {}; + struct spdk_nvmf_subsystem_listener listener = {}; struct spdk_bdev bdev = {}; struct spdk_nvmf_poll_group group = {}; @@ -1522,9 +1539,12 @@ test_fused_compare_and_write(void) subsys_ns[0] = &ns; subsystem.ns = (struct spdk_nvmf_ns **)&subsys_ns; + listener.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; + /* Enable controller */ ctrlr.vcprop.cc.bits.en = 1; ctrlr.subsys = (struct spdk_nvmf_subsystem *)&subsystem; + ctrlr.listener = &listener; group.num_sgroups = 1; sgroups.state = SPDK_NVMF_SUBSYSTEM_ACTIVE; diff --git a/test/unit/lib/nvmf/fc.c/fc_ut.c b/test/unit/lib/nvmf/fc.c/fc_ut.c index 5ae1ac8e0c2..03cabcd36b1 100644 --- a/test/unit/lib/nvmf/fc.c/fc_ut.c +++ b/test/unit/lib/nvmf/fc.c/fc_ut.c @@ -123,10 +123,14 @@ DEFINE_STUB(spdk_bdev_get_block_size, uint32_t, (const struct spdk_bdev *bdev), DEFINE_STUB(spdk_bdev_get_num_blocks, uint64_t, (const struct spdk_bdev *bdev), 1024); DEFINE_STUB(nvmf_ctrlr_async_event_ns_notice, int, (struct spdk_nvmf_ctrlr *ctrlr), 0); +DEFINE_STUB(nvmf_ctrlr_async_event_ana_change_notice, int, + (struct spdk_nvmf_ctrlr *ctrlr), 0); 
DEFINE_STUB_V(spdk_nvme_trid_populate_transport, (struct spdk_nvme_transport_id *trid, enum spdk_nvme_transport_type trtype)); DEFINE_STUB_V(spdk_nvmf_ctrlr_data_init, (struct spdk_nvmf_transport_opts *opts, struct spdk_nvmf_ctrlr_data *cdata)); +DEFINE_STUB(spdk_nvmf_request_complete, int, (struct spdk_nvmf_request *req), + -ENOSPC); const char * spdk_nvme_transport_id_trtype_str(enum spdk_nvme_transport_type trtype) diff --git a/test/unit/lib/nvmf/rdma.c/rdma_ut.c b/test/unit/lib/nvmf/rdma.c/rdma_ut.c index b08d083cf53..85a319fca3a 100644 --- a/test/unit/lib/nvmf/rdma.c/rdma_ut.c +++ b/test/unit/lib/nvmf/rdma.c/rdma_ut.c @@ -77,6 +77,7 @@ DEFINE_STUB_V(_spdk_trace_record, (uint64_t tsc, uint16_t tpoint_id, uint16_t po DEFINE_STUB_V(spdk_nvmf_ctrlr_data_init, (struct spdk_nvmf_transport_opts *opts, struct spdk_nvmf_ctrlr_data *cdata)); DEFINE_STUB_V(spdk_nvmf_request_exec, (struct spdk_nvmf_request *req)); +DEFINE_STUB(spdk_nvmf_request_complete, int, (struct spdk_nvmf_request *req), 0); DEFINE_STUB(spdk_nvme_transport_id_compare, int, (const struct spdk_nvme_transport_id *trid1, const struct spdk_nvme_transport_id *trid2), 0); DEFINE_STUB_V(nvmf_ctrlr_abort_aer, (struct spdk_nvmf_ctrlr *ctrlr)); @@ -85,6 +86,7 @@ DEFINE_STUB(spdk_nvmf_request_get_dif_ctx, bool, (struct spdk_nvmf_request *req, DEFINE_STUB_V(spdk_nvme_trid_populate_transport, (struct spdk_nvme_transport_id *trid, enum spdk_nvme_transport_type trtype)); DEFINE_STUB_V(spdk_nvmf_tgt_new_qpair, (struct spdk_nvmf_tgt *tgt, struct spdk_nvmf_qpair *qpair)); +DEFINE_STUB(nvmf_ctrlr_abort_request, int, (struct spdk_nvmf_request *req), 0); const char * spdk_nvme_transport_id_trtype_str(enum spdk_nvme_transport_type trtype) @@ -755,6 +757,35 @@ test_spdk_nvmf_rdma_request_process(void) qpair_reset(&rqpair, &poller, &device, &resources); } + /* Test 4, invalid command, check xfer type */ + { + struct spdk_nvmf_rdma_recv *rdma_recv_inv; + struct spdk_nvmf_rdma_request *rdma_req_inv; + /* construct an opcode 
that specifies BIDIRECTIONAL transfer */ + uint8_t opc = 0x10 | SPDK_NVME_DATA_BIDIRECTIONAL; + + rdma_recv_inv = create_recv(&rqpair, opc); + rdma_req_inv = create_req(&rqpair, rdma_recv_inv); + + /* NEW -> RDMA_REQUEST_STATE_COMPLETING */ + rqpair.current_recv_depth = 1; + progress = nvmf_rdma_request_process(&rtransport, rdma_req_inv); + CU_ASSERT(progress == true); + CU_ASSERT(rdma_req_inv->state == RDMA_REQUEST_STATE_COMPLETING); + CU_ASSERT(rdma_req_inv->req.rsp->nvme_cpl.status.sct == SPDK_NVME_SCT_GENERIC); + CU_ASSERT(rdma_req_inv->req.rsp->nvme_cpl.status.sc == SPDK_NVME_SC_INVALID_OPCODE); + + /* RDMA_REQUEST_STATE_COMPLETED -> FREE */ + rdma_req_inv->state = RDMA_REQUEST_STATE_COMPLETED; + nvmf_rdma_request_process(&rtransport, rdma_req_inv); + CU_ASSERT(rdma_req_inv->state == RDMA_REQUEST_STATE_FREE); + + free_recv(rdma_recv_inv); + free_req(rdma_req_inv); + poller_reset(&poller, &group); + qpair_reset(&rqpair, &poller, &device, &resources); + } + spdk_mempool_free(rtransport.transport.data_buf_pool); spdk_mempool_free(rtransport.data_wr_pool); } diff --git a/test/unit/lib/nvmf/tcp.c/tcp_ut.c b/test/unit/lib/nvmf/tcp.c/tcp_ut.c index 91a7e22f58a..a20ade5f1b3 100644 --- a/test/unit/lib/nvmf/tcp.c/tcp_ut.c +++ b/test/unit/lib/nvmf/tcp.c/tcp_ut.c @@ -83,6 +83,12 @@ DEFINE_STUB(spdk_nvmf_subsystem_listener_allowed, (struct spdk_nvmf_subsystem *subsystem, const struct spdk_nvme_transport_id *trid), true); +DEFINE_STUB(nvmf_subsystem_find_listener, + struct spdk_nvmf_subsystem_listener *, + (struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid), + (void *)0x1); + DEFINE_STUB_V(nvmf_get_discovery_log_page, (struct spdk_nvmf_tgt *tgt, const char *hostnqn, struct iovec *iov, uint32_t iovcnt, uint64_t offset, uint32_t length)); @@ -163,6 +169,12 @@ DEFINE_STUB(nvmf_bdev_ctrlr_nvme_passthru_io, struct spdk_nvmf_request *req), 0); +DEFINE_STUB(spdk_nvmf_bdev_ctrlr_abort_cmd, + int, + (struct spdk_bdev *bdev, struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, + struct spdk_nvmf_request *req, struct spdk_nvmf_request *req_to_abort), + 0); + DEFINE_STUB(nvmf_bdev_ctrlr_get_dif_ctx, bool, (struct spdk_bdev *bdev, struct spdk_nvme_cmd *cmd, struct spdk_dif_ctx *dif_ctx), @@ -200,6 +212,12 @@ DEFINE_STUB_V(spdk_nvmf_transport_register, (const struct spdk_nvmf_transport_op DEFINE_STUB_V(spdk_nvmf_tgt_new_qpair, (struct spdk_nvmf_tgt *tgt, struct spdk_nvmf_qpair *qpair)); +DEFINE_STUB_V(nvmf_transport_qpair_abort_request, + (struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_request *req)); + +DEFINE_STUB_V(spdk_nvme_print_command, (uint16_t qid, struct spdk_nvme_cmd *cmd)); +DEFINE_STUB_V(spdk_nvme_print_completion, (uint16_t qid, struct spdk_nvme_cpl *cpl)); + struct spdk_trace_histories *g_trace_histories; struct spdk_bdev { @@ -632,6 +650,7 @@ test_nvmf_tcp_incapsule_data_handle(void) tqpair.qpair.transport = &ttransport.transport; tqpair.state = NVME_TCP_QPAIR_STATE_RUNNING; tqpair.recv_state = NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH; + tqpair.qpair.state = SPDK_NVMF_QPAIR_ACTIVE; /* init a null tcp_req into tqpair TCP_REQUEST_STATE_FREE queue */ tcp_req2.req.qpair = &tqpair.qpair; diff --git a/test/unit/lib/scsi/lun.c/lun_ut.c b/test/unit/lib/scsi/lun.c/lun_ut.c index bce3fa6a375..4efa8e36477 100644 --- a/test/unit/lib/scsi/lun.c/lun_ut.c +++ b/test/unit/lib/scsi/lun.c/lun_ut.c @@ -107,6 +107,7 @@ DEFINE_STUB_V(spdk_scsi_dev_delete_lun, (struct spdk_scsi_dev *dev, struct spdk_scsi_lun *lun)); DEFINE_STUB(scsi_pr_check, int, (struct spdk_scsi_task *task), 0); +DEFINE_STUB(scsi2_reserve_check, int, (struct spdk_scsi_task *task), 0); void bdev_scsi_reset(struct spdk_scsi_task *task) diff --git a/test/unit/lib/scsi/scsi_bdev.c/scsi_bdev_ut.c b/test/unit/lib/scsi/scsi_bdev.c/scsi_bdev_ut.c index cd44dc10136..4e64f707105 100644 --- a/test/unit/lib/scsi/scsi_bdev.c/scsi_bdev_ut.c +++ b/test/unit/lib/scsi/scsi_bdev.c/scsi_bdev_ut.c @@ -104,6 +104,9 @@ DEFINE_STUB(scsi_pr_out, int, (struct 
spdk_scsi_task *task, DEFINE_STUB(scsi_pr_in, int, (struct spdk_scsi_task *task, uint8_t *cdb, uint8_t *data, uint16_t data_len), 0); +DEFINE_STUB(scsi2_reserve, int, (struct spdk_scsi_task *task, uint8_t *cdb), 0); +DEFINE_STUB(scsi2_release, int, (struct spdk_scsi_task *task), 0); + void scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) { diff --git a/test/unit/lib/scsi/scsi_pr.c/scsi_pr_ut.c b/test/unit/lib/scsi/scsi_pr.c/scsi_pr_ut.c index 27ac8d41c99..99327703605 100644 --- a/test/unit/lib/scsi/scsi_pr.c/scsi_pr_ut.c +++ b/test/unit/lib/scsi/scsi_pr.c/scsi_pr_ut.c @@ -535,6 +535,117 @@ test_reservation_cmds_conflict(void) ut_deinit_reservation_test(); } +static void +test_scsi2_reserve_release(void) +{ + struct spdk_scsi_task task = {0}; + uint8_t cdb[32] = {}; + int rc; + + task.lun = &g_lun; + task.target_port = &g_t_port_0; + task.cdb = cdb; + + ut_init_reservation_test(); + + /* Test Case: SPC2 RESERVE from Host A */ + task.initiator_port = &g_i_port_a; + task.cdb[0] = SPDK_SPC2_RESERVE_10; + rc = scsi2_reserve(&task, task.cdb); + SPDK_CU_ASSERT_FATAL(rc == 0); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.holder != NULL); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.flags == SCSI_SPC2_RESERVE); + + /* Test Case: READ command from Host B */ + task.initiator_port = &g_i_port_b; + task.cdb[0] = SPDK_SBC_READ_10; + task.status = 0; + rc = scsi2_reserve_check(&task); + SPDK_CU_ASSERT_FATAL(rc < 0); + SPDK_CU_ASSERT_FATAL(task.status == SPDK_SCSI_STATUS_RESERVATION_CONFLICT); + + /* Test Case: SPDK_SPC2_RELEASE10 command from Host B */ + task.initiator_port = &g_i_port_b; + task.cdb[0] = SPDK_SPC2_RELEASE_10; + task.status = 0; + rc = scsi2_reserve_check(&task); + SPDK_CU_ASSERT_FATAL(rc == 0); + + rc = scsi2_release(&task); + SPDK_CU_ASSERT_FATAL(rc == 0); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.holder == NULL); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.flags == 0); + + /* Test Case: SPC2 RESERVE from Host B */ + task.initiator_port = 
&g_i_port_b; + task.cdb[0] = SPDK_SPC2_RESERVE_10; + rc = scsi2_reserve(&task, task.cdb); + SPDK_CU_ASSERT_FATAL(rc == 0); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.holder != NULL); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.flags == SCSI_SPC2_RESERVE); + + /* Test Case: READ command from Host B */ + task.initiator_port = &g_i_port_b; + task.cdb[0] = SPDK_SBC_READ_10; + rc = scsi2_reserve_check(&task); + SPDK_CU_ASSERT_FATAL(rc == 0); + + /* Test Case: SPDK_SPC2_RELEASE10 command from Host A */ + task.initiator_port = &g_i_port_a; + task.cdb[0] = SPDK_SPC2_RELEASE_10; + + rc = scsi2_release(&task); + SPDK_CU_ASSERT_FATAL(rc == 0); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.holder == NULL); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.flags == 0); + + ut_deinit_reservation_test(); +} + +static void +test_pr_with_scsi2_reserve_release(void) +{ + struct spdk_scsi_task task = {0}; + uint8_t cdb[32] = {}; + int rc; + + task.lun = &g_lun; + task.target_port = &g_t_port_0; + task.cdb = cdb; + + ut_init_reservation_test(); + test_build_registrants(); + + task.initiator_port = &g_i_port_a; + task.status = 0; + /* Test Case: Host A acquires the reservation */ + rc = scsi_pr_out_reserve(&task, SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY, + 0xa, 0, 0, 0); + SPDK_CU_ASSERT_FATAL(rc == 0); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.crkey == 0xa); + + /* Test Case: SPDK_SPC2_RESERVE_10 command from Host B */ + task.initiator_port = &g_i_port_b; + task.cdb[0] = SPDK_SPC2_RESERVE_10; + /* SPC2 RESERVE/RELEASE will pass to scsi2_reserve/release */ + rc = scsi_pr_check(&task); + SPDK_CU_ASSERT_FATAL(rc == 0); + + /* do nothing with PR but have good status */ + rc = scsi2_reserve(&task, task.cdb); + SPDK_CU_ASSERT_FATAL(rc == 0); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.holder != NULL); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY); + + rc = scsi2_release(&task); + 
SPDK_CU_ASSERT_FATAL(rc == 0); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.holder != NULL); + SPDK_CU_ASSERT_FATAL(g_lun.reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY); + + ut_deinit_reservation_test(); +} + int main(int argc, char **argv) { @@ -550,6 +661,8 @@ main(int argc, char **argv) CU_ADD_TEST(suite, test_reservation_preempt_non_all_regs); CU_ADD_TEST(suite, test_reservation_preempt_all_regs); CU_ADD_TEST(suite, test_reservation_cmds_conflict); + CU_ADD_TEST(suite, test_scsi2_reserve_release); + CU_ADD_TEST(suite, test_pr_with_scsi2_reserve_release); CU_basic_set_mode(CU_BRM_VERBOSE); CU_basic_run_tests(); diff --git a/test/unit/lib/util/cpuset.c/cpuset_ut.c b/test/unit/lib/util/cpuset.c/cpuset_ut.c index 3630c5cbd68..5dd37311268 100644 --- a/test/unit/lib/util/cpuset.c/cpuset_ut.c +++ b/test/unit/lib/util/cpuset.c/cpuset_ut.c @@ -180,6 +180,15 @@ test_cpuset_parse(void) rc = spdk_cpuset_parse(core_mask, "[184467440737095516150]"); CU_ASSERT(rc < 0); + /* Test mask with cores 4-7 and 168-171 set. 
*/ + rc = spdk_cpuset_parse(core_mask, "0xF0000000000000000000000000000000000000000F0"); + CU_ASSERT(rc == 0); + CU_ASSERT(cpuset_check_range(core_mask, 0, 3, false) == 0); + CU_ASSERT(cpuset_check_range(core_mask, 4, 7, true) == 0); + CU_ASSERT(cpuset_check_range(core_mask, 8, 167, false) == 0); + CU_ASSERT(cpuset_check_range(core_mask, 168, 171, true) == 0); + CU_ASSERT(cpuset_check_range(core_mask, 172, SPDK_CPUSET_SIZE - 1, false) == 0); + spdk_cpuset_free(core_mask); } diff --git a/test/unit/unittest.sh b/test/unit/unittest.sh index 955ddb7f91d..39bfdbb4a4f 100755 --- a/test/unit/unittest.sh +++ b/test/unit/unittest.sh @@ -100,6 +100,10 @@ function unittest_scsi() { function unittest_sock() { $valgrind $testdir/lib/sock/sock.c/sock_ut $valgrind $testdir/lib/sock/posix.c/posix_ut + # Check whether uring is configured + if grep -q '#define SPDK_CONFIG_URING 1' $rootdir/include/spdk/config.h; then + $valgrind $testdir/lib/sock/uring.c/uring_ut + fi } function unittest_util() { diff --git a/test/vhost/common.sh b/test/vhost/common.sh index b76ca190dda..33c8e095392 100644 --- a/test/vhost/common.sh +++ b/test/vhost/common.sh @@ -1049,6 +1049,7 @@ function run_fio() { local run_plugin_mode=false local fio_start_cmd local fio_output_format="normal" + local fio_gtod_reduce=false local wait_for_fio=true for arg in "$@"; do @@ -1069,6 +1070,7 @@ function run_fio() { --json) fio_output_format="json" ;; --hide-results) hide_results=true ;; --no-wait-for-fio) wait_for_fio=false ;; + --gtod-reduce) fio_gtod_reduce=true ;; *) error "Invalid argument '$arg'" return 1 @@ -1103,6 +1105,11 @@ function run_fio() { local vmdisks=${vm#*:} sed "s@filename=@filename=$vmdisks@" $job_file | vm_exec $vm_num "cat > /root/$job_fname" + + if $fio_gtod_reduce; then + vm_exec $vm_num "echo 'gtod_reduce=1' >> /root/$job_fname" + fi + vm_exec $vm_num cat /root/$job_fname if $run_server_mode; then @@ -1142,11 +1149,94 @@ function run_fio() { $fio_start_cmd sleep 1 + if [[ "$fio_output_format" 
== "json" ]]; then + # Fio in client-server mode produces a lot of "trash" output + # preceding JSON structure, making it not possible to parse. + # Remove these lines from file. + # shellcheck disable=SC2005 + echo "$(grep -vP '^[<\w]' "$out/$log_fname")" > "$out/$log_fname" + fi + if [[ ! $hide_results ]]; then cat $out/$log_fname fi } +# Parse fio JSON logs (client-server mode only): $1 - log directory, $2 - log filename prefix; averages iops, bw and latency (in microseconds) over all matching logs and writes a one-row CSV into $1. +function parse_fio_results() { + local fio_log_dir=$1 + local fio_log_filename=$2 + local fio_csv_filename + + # Variables used in parsing loop + local log_file log_files + local rwmode mixread mixwrite + local lat_key lat_divisor + local client_stats iops bw + local read_avg_lat read_min_lat read_max_lat + local write_avg_lat write_min_lat write_max_lat + + declare -A results + results["iops"]=0 + results["bw"]=0 + results["avg_lat"]=0 + results["min_lat"]=0 + results["max_lat"]=0 + + # Loop using the log filename to see if there are any other + # matching files. This is in case we ran fio test multiple times. + log_files=("$fio_log_dir/$fio_log_filename"*) + for log_file in "${log_files[@]}"; do + rwmode=$(jq -r '.["client_stats"][0]["job options"]["rw"]' "$log_file") + mixread=1 + mixwrite=1 + if [[ $rwmode = *"rw"* ]]; then + mixread=$(jq -r '.["client_stats"][0]["job options"]["rwmixread"]' "$log_file") + mixread=$(bc -l <<< "scale=3; $mixread/100") + mixwrite=$(bc -l <<< "scale=3; 1-$mixread") + fi + + client_stats=$(jq -r '.["client_stats"][] | select(.jobname == "All clients")' "$log_file") + + # Check latency unit and later normalize to microseconds + lat_key="lat_us" + lat_divisor=1 + if jq -er '.read["lat_ns"]' &> /dev/null <<< $client_stats; then + lat_key="lat_ns" + lat_divisor=1000 + fi + + # Horrific bash floating point arithmetic operations below. + # Viewer discretion is advised. 
+ iops=$(jq -r '[.read["iops"],.write["iops"]] | add' <<< $client_stats) + bw=$(jq -r '[.read["bw"],.write["bw"]] | add' <<< $client_stats) + read_avg_lat=$(jq -r --arg lat_key $lat_key '.read[$lat_key]["mean"]' <<< $client_stats) + read_min_lat=$(jq -r --arg lat_key $lat_key '.read[$lat_key]["min"]' <<< $client_stats) + read_max_lat=$(jq -r --arg lat_key $lat_key '.read[$lat_key]["max"]' <<< $client_stats) + write_avg_lat=$(jq -r --arg lat_key $lat_key '.write[$lat_key]["mean"]' <<< $client_stats) + write_min_lat=$(jq -r --arg lat_key $lat_key '.write[$lat_key]["min"]' <<< $client_stats) + write_max_lat=$(jq -r --arg lat_key $lat_key '.write[$lat_key]["max"]' <<< $client_stats) + + results["iops"]=$(bc -l <<< "${results[iops]} + $iops") + results["bw"]=$(bc -l <<< "${results[bw]} + $bw") + results["avg_lat"]=$(bc -l <<< "${results[avg_lat]} + ($mixread*$read_avg_lat + $mixwrite*$write_avg_lat)/$lat_divisor") + results["min_lat"]=$(bc -l <<< "${results[min_lat]} + ($mixread*$read_min_lat + $mixwrite*$write_min_lat)/$lat_divisor") + results["max_lat"]=$(bc -l <<< "${results[max_lat]} + ($mixread*$read_max_lat + $mixwrite*$write_max_lat)/$lat_divisor") + done + + results["iops"]=$(bc -l <<< "scale=3; ${results[iops]} / ${#log_files[@]}") + results["bw"]=$(bc -l <<< "scale=3; ${results[bw]} / ${#log_files[@]}") + results["avg_lat"]=$(bc -l <<< "scale=3; ${results[avg_lat]} / ${#log_files[@]}") + results["min_lat"]=$(bc -l <<< "scale=3; ${results[min_lat]} / ${#log_files[@]}") + results["max_lat"]=$(bc -l <<< "scale=3; ${results[max_lat]} / ${#log_files[@]}") + + fio_csv_filename="${fio_log_filename%%.*}.csv" + cat <<- EOF > "$fio_log_dir/$fio_csv_filename" + iops,bw,avg_lat,min_lat,max_lat + ${results["iops"]},${results["bw"]},${results["avg_lat"]},${results["min_lat"]},${results["max_lat"]} + EOF +} + # Shutdown or kill any running VM and SPDK APP. 
# function at_app_exit() { diff --git a/test/vhost/perf_bench/vhost_perf.sh b/test/vhost/perf_bench/vhost_perf.sh index 068ab50017b..98c6a8e3c7f 100755 --- a/test/vhost/perf_bench/vhost_perf.sh +++ b/test/vhost/perf_bench/vhost_perf.sh @@ -25,6 +25,7 @@ wwpn_prefix="naa.5001405bc6498" packed_ring=false fio_iterations=1 +fio_gtod="" precond_fio_bin=$CONFIG_FIO_SOURCE_DIR/fio disk_map="" @@ -48,6 +49,7 @@ function usage() { echo " of binary is recommended." echo " --fio-jobs=PATH Comma separated list of fio config files to use for test." echo " --fio-iterations=INT Number of times to run specified workload." + echo " --fio-gtod-reduce Enable fio gtod_reduce option in test." echo " --vm-memory=INT Amount of RAM memory (in MB) to pass to a single VM." echo " Default: 2048 MB" echo " --vm-image=PATH OS image to use for running the VMs." @@ -161,6 +163,7 @@ while getopts 'xh-:' optchar; do fio-bin=*) fio_bin="--fio-bin=${OPTARG#*=}" ;; fio-jobs=*) fio_jobs="${OPTARG#*=}" ;; fio-iterations=*) fio_iterations="${OPTARG#*=}" ;; + fio-gtod-reduce) fio_gtod="--gtod-reduce" ;; vm-memory=*) vm_memory="${OPTARG#*=}" ;; vm-image=*) VM_IMAGE="${OPTARG#*=}" ;; vm-sar-enable) vm_sar_enable=true ;; @@ -411,7 +414,7 @@ for fio_job in ${fio_jobs//,/ }; do fio_log_fname="${fio_job_fname%%.*}.log" for i in $(seq 1 $fio_iterations); do echo "Running FIO iteration $i for $fio_job_fname" - run_fio $fio_bin --hide-results --job-file="$fio_job" --out="$VHOST_DIR/fio_results" --json $fio_disks & + run_fio $fio_bin --hide-results --job-file="$fio_job" --out="$VHOST_DIR/fio_results" --json $fio_disks $fio_gtod & fio_pid=$! if $host_sar_enable || $vm_sar_enable; then @@ -446,6 +449,8 @@ for fio_job in ${fio_jobs//,/ }; do mv $VHOST_DIR/fio_results/$fio_log_fname $VHOST_DIR/fio_results/$fio_log_fname.$i sleep 1 done + + parse_fio_results "$VHOST_DIR/fio_results" "$fio_log_fname" done notice "Shutting down virtual machines..."