---
 Makefile.inc                         |    1 
 libmultipath/Makefile                |    7 
 libmultipath/hwtable.c               |    1 
 libmultipath/nvme-ioctl.c            |  869 ++++++++++++++++++++
 libmultipath/nvme-ioctl.h            |  139 +++
 libmultipath/nvme-lib.c              |   49 +
 libmultipath/nvme-lib.h              |   39 
 libmultipath/nvme/argconfig.h        |   99 ++
 libmultipath/nvme/json.h             |   87 ++
 libmultipath/nvme/linux/nvme.h       | 1450 +++++++++++++++++++++++++++++++++++
 libmultipath/nvme/linux/nvme_ioctl.h |   67 +
 libmultipath/nvme/nvme.h             |  163 +++
 libmultipath/nvme/plugin.h           |   36 
 libmultipath/prio.h                  |    1 
 libmultipath/prioritizers/Makefile   |    4 
 libmultipath/prioritizers/ana.c      |  236 +++++
 libmultipath/propsel.c               |   10 
 libmultipath/util.h                  |    2 
 multipath/multipath.conf.5           |    3 
 19 files changed, 3258 insertions(+), 5 deletions(-)

Index: multipath-tools-130222/libmultipath/nvme/argconfig.h
===================================================================
--- /dev/null
+++ multipath-tools-130222/libmultipath/nvme/argconfig.h
@@ -0,0 +1,99 @@
+////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2014 PMC-Sierra, Inc.
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+// +//////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////// +// +// Author: Logan Gunthorpe +// Logan Gunthorpe +// +// Date: Oct 23 2014 +// +// Description: +// Header file for argconfig.c +// +//////////////////////////////////////////////////////////////////////// + +#ifndef argconfig_H +#define argconfig_H + +#include +#include +#include + +enum argconfig_types { + CFG_NONE, + CFG_STRING, + CFG_INT, + CFG_SIZE, + CFG_LONG, + CFG_LONG_SUFFIX, + CFG_DOUBLE, + CFG_BOOL, + CFG_BYTE, + CFG_SHORT, + CFG_POSITIVE, + CFG_INCREMENT, + CFG_SUBOPTS, + CFG_FILE_A, + CFG_FILE_W, + CFG_FILE_R, + CFG_FILE_AP, + CFG_FILE_WP, + CFG_FILE_RP, +}; + +struct argconfig_commandline_options { + const char *option; + const char short_option; + const char *meta; + enum argconfig_types config_type; + void *default_value; + int argument_type; + const char *help; +}; + +#define CFG_MAX_SUBOPTS 500 +#define MAX_HELP_FUNC 20 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void argconfig_help_func(void); +void argconfig_append_usage(const char *str); +void argconfig_print_help(const char *program_desc, + const struct argconfig_commandline_options *options); +int argconfig_parse(int argc, char *argv[], const char *program_desc, + const struct argconfig_commandline_options *options, + void *config_out, size_t config_size); +int argconfig_parse_subopt_string(char *string, char **options, + size_t max_options); +unsigned argconfig_parse_comma_sep_array(char *string, int *ret, + unsigned max_length); +unsigned argconfig_parse_comma_sep_array_long(char *string, + unsigned long long *ret, + unsigned max_length); +void argconfig_register_help_func(argconfig_help_func * f); + +void print_word_wrapped(const char *s, int indent, int start); +#ifdef __cplusplus +} +#endif +#endif Index: multipath-tools-130222/libmultipath/nvme/json.h =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/nvme/json.h @@ -0,0 +1,87 @@ +#ifndef __JSON__H +#define __JSON__H + +struct json_object; +struct json_array; +struct json_pair; + +#define JSON_TYPE_STRING 0 +#define JSON_TYPE_INTEGER 1 +#define JSON_TYPE_FLOAT 2 +#define JSON_TYPE_OBJECT 3 +#define JSON_TYPE_ARRAY 4 +#define JSON_TYPE_UINT 5 +#define JSON_PARENT_TYPE_PAIR 0 +#define JSON_PARENT_TYPE_ARRAY 1 +struct json_value { + int type; + union { + long long integer_number; + unsigned long long uint_number; + long double float_number; + char *string; + struct json_object *object; + struct json_array *array; + }; + int parent_type; + union { + struct json_pair *parent_pair; + struct json_array *parent_array; + }; +}; + +struct json_array { + struct json_value **values; + int value_cnt; + struct json_value *parent; +}; + +struct json_object { + struct json_pair **pairs; + int pair_cnt; + struct json_value *parent; +}; + +struct json_pair { + char *name; + struct json_value *value; + struct json_object *parent; +}; + +struct json_object *json_create_object(void); +struct json_array *json_create_array(void); + +void json_free_object(struct json_object *obj); + +int json_object_add_value_type(struct json_object *obj, const char *name, int type, ...); +#define json_object_add_value_int(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_INTEGER, (long long) (val)) +#define json_object_add_value_uint(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_UINT, (unsigned long long) (val)) +#define 
json_object_add_value_float(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_FLOAT, (val)) +#define json_object_add_value_string(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_STRING, (val)) +#define json_object_add_value_object(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_OBJECT, (val)) +#define json_object_add_value_array(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_ARRAY, (val)) +int json_array_add_value_type(struct json_array *array, int type, ...); +#define json_array_add_value_int(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_INTEGER, (val)) +#define json_array_add_value_uint(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_UINT, (val)) +#define json_array_add_value_float(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_FLOAT, (val)) +#define json_array_add_value_string(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_STRING, (val)) +#define json_array_add_value_object(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_OBJECT, (val)) +#define json_array_add_value_array(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_ARRAY, (val)) + +#define json_array_last_value_object(obj) \ + (obj->values[obj->value_cnt - 1]->object) + +void json_print_object(struct json_object *obj, void *); +#endif Index: multipath-tools-130222/libmultipath/nvme/nvme.h =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/nvme/nvme.h @@ -0,0 +1,163 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef _NVME_H +#define _NVME_H + +#include +#include +#include +#include "plugin.h" +#include "json.h" + +#define unlikely(x) x + +#ifdef LIBUUID +#include +#else +typedef struct { + uint8_t b[16]; +} uuid_t; +#endif + +#include "linux/nvme.h" + +struct nvme_effects_log_page { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + +struct nvme_error_log_page { + __u64 error_count; + __u16 sqid; + __u16 cmdid; + __u16 status_field; + __u16 parm_error_location; + __u64 lba; + __u32 nsid; + __u8 vs; + __u8 resv[3]; + __u64 cs; + __u8 resv2[24]; +}; + +struct nvme_firmware_log_page { + __u8 afi; + __u8 resv[7]; + __u64 frs[7]; + __u8 resv2[448]; +}; + +/* idle and active power scales occupy the last 2 bits of the field */ +#define POWER_SCALE(s) ((s) >> 6) + +struct nvme_host_mem_buffer { + __u32 hsize; + __u32 hmdlal; + __u32 hmdlau; + __u32 hmdlec; + __u8 rsvd16[4080]; +}; + +struct nvme_auto_pst { + __u32 data; + __u32 rsvd32; +}; + +struct nvme_timestamp { + __u8 timestamp[6]; + __u8 attr; + __u8 rsvd; +}; + +struct nvme_controller_list { + __le16 num; + __le16 identifier[]; +}; + +struct nvme_bar_cap { + __u16 mqes; + __u8 ams_cqr; + __u8 to; + __u16 bps_css_nssrs_dstrd; + __u8 mpsmax_mpsmin; + __u8 reserved; +}; + +#ifdef __CHECKER__ +#define __force __attribute__((force)) +#else +#define __force +#endif + +#define cpu_to_le16(x) \ + ((__force __le16)htole16(x)) +#define cpu_to_le32(x) \ + ((__force __le32)htole32(x)) +#define cpu_to_le64(x) \ + ((__force __le64)htole64(x)) + +#define le16_to_cpu(x) \ + le16toh((__force __u16)(x)) +#define le32_to_cpu(x) \ + le32toh((__force __u32)(x)) +#define le64_to_cpu(x) \ + le64toh((__force __u64)(x)) + +#define MAX_LIST_ITEMS 256 +struct list_item { + char node[1024]; + struct nvme_id_ctrl ctrl; + int nsid; + struct nvme_id_ns ns; + unsigned block; +}; + +struct ctrl_list_item { + char *name; + char *address; + char *transport; + char *state; + char *ana_state; +}; + +struct subsys_list_item { + char *name; + char *subsysnqn; + int nctrls; + struct ctrl_list_item *ctrls; +}; + +enum { + NORMAL, + JSON, + BINARY, +}; + +void register_extension(struct plugin *plugin); + +#include "argconfig.h" +int parse_and_open(int argc, char **argv, const char *desc, + const struct argconfig_commandline_options *clo, void *cfg, size_t size); + +extern const char *devicename; + +int __id_ctrl(int argc, char **argv, struct command *cmd, struct plugin *plugin, void (*vs)(__u8 *vs, struct json_object *root)); +int validate_output_format(char *format); + +struct subsys_list_item *get_subsys_list(int *subcnt, char *subsysnqn, __u32 nsid); +void free_subsys_list(struct subsys_list_item *slist, int n); +char *nvme_char_from_block(char *block); +#endif /* _NVME_H */ Index: multipath-tools-130222/libmultipath/nvme/plugin.h =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/nvme/plugin.h @@ -0,0 +1,36 @@ +#ifndef PLUGIN_H +#define PLUGIN_H + +#include + +struct program { + const char *name; + const char *version; + const char *usage; + const char *desc; + const char *more; + struct command **commands; + struct plugin *extensions; +}; + +struct plugin { + const char *name; + const char *desc; + struct command **commands; + struct program *parent; + struct plugin *next; + struct plugin *tail; +}; + +struct command { + char *name; + char *help; + int (*fn)(int argc, char **argv, struct command *command, struct plugin *plugin); + char *alias; +}; + +void usage(struct plugin *plugin); +void 
general_help(struct plugin *plugin); +int handle_plugin(int argc, char **argv, struct plugin *plugin); + +#endif Index: multipath-tools-130222/libmultipath/nvme/linux/nvme.h =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/nvme/linux/nvme.h @@ -0,0 +1,1450 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_NVME_H +#define _LINUX_NVME_H + +#include +#include + +/* NQN names in commands fields specified one size */ +#define NVMF_NQN_FIELD_LEN 256 + +/* However the max length of a qualified name is another size */ +#define NVMF_NQN_SIZE 223 + +#define NVMF_TRSVCID_SIZE 32 +#define NVMF_TRADDR_SIZE 256 +#define NVMF_TSAS_SIZE 256 + +#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" + +#define NVME_RDMA_IP_PORT 4420 + +#define NVME_NSID_ALL 0xffffffff + +enum nvme_subsys_type { + NVME_NQN_DISC = 1, /* Discovery type target subsystem */ + NVME_NQN_NVME = 2, /* NVME type target subsystem */ +}; + +/* Address Family codes for Discovery Log Page entry ADRFAM field */ +enum { + NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ + NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */ + NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */ + NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */ + NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */ +}; + +/* Transport Type codes for Discovery Log Page entry TRTYPE field */ +enum { + NVMF_TRTYPE_RDMA = 1, /* RDMA */ + NVMF_TRTYPE_FC = 2, /* Fibre Channel */ + NVMF_TRTYPE_TCP = 3, /* TCP */ + NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ + NVMF_TRTYPE_MAX, +}; + +/* Transport Requirements codes for Discovery Log Page entry TREQ field */ +enum { + NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ + NVMF_TREQ_REQUIRED = 1, /* Required */ + NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ + NVMF_TREQ_DISABLE_SQFLOW = (1 << 2), /* SQ flow control disable supported */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */ +}; + +/* RDMA Connection Management Service Type codes for Discovery Log Page + * entry TSAS RDMA_CMS field + */ +enum { + NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ +}; + +/* TCP port security type for Discovery Log Page entry TSAS + */ +enum { + NVMF_TCP_SECTYPE_NONE = 0, /* No Security */ + NVMF_TCP_SECTYPE_TLS = 1, /* Transport Layer Security */ +}; + +#define NVME_AQ_DEPTH 32 +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) + +/* + * Subtract one to leave an empty queue entry for 'Full 
Queue' condition. See + * NVM-Express 1.2 specification, section 4.1.2. + */ +#define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1) + +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + NVME_REG_BPINFO = 0x0040, /* Boot Partition Information */ + NVME_REG_BPRSEL = 0x0044, /* Boot Partition Read Select */ + NVME_REG_BPMBL = 0x0048, /* Boot Partition Memory Buffer Location */ + NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ +}; + +#define NVME_CAP_MQES(cap) ((cap) & 0xffff) +#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) +#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) +#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) +#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) + +#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) +#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) +#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) +#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) + +#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) +#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) +#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) +#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) +#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) + +/* + * Submission and Completion Queue Entry Sizes for the NVM command set. + * (In bytes and specified as a power of two (2^n)). 
+ */ +#define NVME_NVM_IOSQES 6 +#define NVME_NVM_IOCQES 4 + +enum { + NVME_CC_ENABLE = 1 << 0, + NVME_CC_CSS_NVM = 0 << 4, + NVME_CC_EN_SHIFT = 0, + NVME_CC_CSS_SHIFT = 4, + NVME_CC_MPS_SHIFT = 7, + NVME_CC_AMS_SHIFT = 11, + NVME_CC_SHN_SHIFT = 14, + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CSTS_RDY = 1 << 0, + NVME_CSTS_CFS = 1 << 1, + NVME_CSTS_NSSRO = 1 << 4, + NVME_CSTS_PP = 1 << 5, + NVME_CSTS_SHST_NORMAL = 0 << 2, + NVME_CSTS_SHST_OCCUR = 1 << 2, + NVME_CSTS_SHST_CMPLT = 2 << 2, + NVME_CSTS_SHST_MASK = 3 << 2, +}; + +struct nvme_id_power_state { + __le16 max_power; /* centiwatts */ + __u8 rsvd2; + __u8 flags; + __le32 entry_lat; /* microseconds */ + __le32 exit_lat; /* microseconds */ + __u8 read_tput; + __u8 read_lat; + __u8 write_tput; + __u8 write_lat; + __le16 idle_power; + __u8 idle_scale; + __u8 rsvd19; + __le16 active_power; + __u8 active_work_scale; + __u8 rsvd23[9]; +}; + +enum { + NVME_PS_FLAGS_MAX_POWER_SCALE = 1 << 0, + NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 cmic; + __u8 mdts; + __le16 cntlid; + __le32 ver; + __le32 rtd3r; + __le32 rtd3e; + __le32 oaes; + __le32 ctratt; + __le16 rrls; + __u8 rsvd102[154]; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 avscc; + __u8 apsta; + __le16 wctemp; + __le16 cctemp; + __le16 mtfa; + __le32 hmpre; + __le32 hmmin; + __u8 tnvmcap[16]; + __u8 unvmcap[16]; + __le32 rpmbs; + __le16 edstt; + __u8 dsto; + __u8 fwug; + __le16 kas; + __le16 hctma; + __le16 mntmt; + __le16 mxtmt; + __le32 sanicap; + __le32 hmminds; + __le16 hmmaxd; + __le16 nsetidmax; + __u8 rsvd340[2]; + __u8 anatt; + __u8 anacap; + __le32 anagrpmax; + __le32 nanagrpid; + __u8 rsvd352[160]; + __u8 sqes; + __u8 cqes; + __le16 maxcmd; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 nvscc; + __u8 nwpc; + __le16 acwu; + __u8 rsvd534[2]; + __le32 sgls; + __le32 mnan; + __u8 rsvd544[224]; + char subnqn[256]; + __u8 rsvd1024[768]; + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 ctrattr; + __u8 msdbd; + __u8 rsvd1804[244]; + struct nvme_id_power_state psd[32]; + __u8 vs[1024]; +}; + +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, + NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, + NVME_CTRL_VWC_PRESENT = 1 << 0, + NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DIRECTIVES = 1 << 5, + NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, + NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, + NVME_CTRL_CTRATT_128_ID = 1 << 0, + NVME_CTRL_CTRATT_NON_OP_PSP = 1 << 1, + NVME_CTRL_CTRATT_NVM_SETS = 1 << 2, + NVME_CTRL_CTRATT_READ_RECV_LVLS = 1 << 3, + NVME_CTRL_CTRATT_ENDURANCE_GROUPS = 1 << 4, + NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 
mc; + __u8 dpc; + __u8 dps; + __u8 nmic; + __u8 rescap; + __u8 fpi; + __u8 dlfeat; + __le16 nawun; + __le16 nawupf; + __le16 nacwu; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __le16 noiob; + __u8 nvmcap[16]; + __u8 rsvd64[28]; + __le32 anagrpid; + __u8 rsvd96[3]; + __u8 nsattr; + __le16 nvmsetid; + __le16 endgid; + __u8 nguid[16]; + __u8 eui64[8]; + struct nvme_lbaf lbaf[16]; + __u8 rsvd192[192]; + __u8 vs[3712]; +}; + +enum { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESC_LIST = 0x03, + NVME_ID_CNS_NVMSET_LIST = 0x04, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_CTRL_NS_LIST = 0x12, + NVME_ID_CNS_CTRL_LIST = 0x13, +}; + +enum { + NVME_DIR_IDENTIFY = 0x00, + NVME_DIR_STREAMS = 0x01, + NVME_DIR_SND_ID_OP_ENABLE = 0x01, + NVME_DIR_SND_ST_OP_REL_ID = 0x01, + NVME_DIR_SND_ST_OP_REL_RSC = 0x02, + NVME_DIR_RCV_ID_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_STATUS = 0x02, + NVME_DIR_RCV_ST_OP_RESOURCE = 0x03, + NVME_DIR_ENDIR = 0x01, +}; + +enum { + NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, + NVME_LBAF_RP_BEST = 0, + NVME_LBAF_RP_BETTER = 1, + NVME_LBAF_RP_GOOD = 2, + NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 = 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, +}; + +struct nvme_ns_id_desc { + __u8 nidt; + __u8 nidl; + __le16 reserved; +}; + +#define NVME_NIDT_EUI64_LEN 8 +#define NVME_NIDT_NGUID_LEN 16 +#define NVME_NIDT_UUID_LEN 16 + +enum { + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, +}; + +#define NVME_MAX_NVMSET 31 + +struct nvme_nvmset_attr_entry { + __le16 id; + __le16 endurance_group_id; + __u8 rsvd4[4]; + __le32 random_4k_read_typical; + __le32 opt_write_size; + __u8 total_nvmset_cap[16]; + __u8 unalloc_nvmset_cap[16]; + __u8 rsvd48[80]; +}; + +struct nvme_id_nvmset { + __u8 nid; + __u8 rsvd1[127]; + struct nvme_nvmset_attr_entry ent[NVME_MAX_NVMSET]; +}; + +/* Derived from 1.3a Figure 101: Get Log Page – Telemetry Host + * -Initiated Log (Log Identifier 07h) + */ +struct nvme_telemetry_log_page_hdr { + __u8 lpi; /* Log page identifier */ + __u8 rsvd[4]; + __u8 iee_oui[3]; + __u16 dalb1; /* Data area 1 last block */ + __u16 dalb2; /* Data area 2 last block */ + __u16 dalb3; /* Data area 3 last block */ + __u8 rsvd1[368]; /* TODO verify */ + __u8 ctrlavail; /* Controller initiated data avail?*/ + __u8 ctrldgn; /* Controller initiated telemetry Data Gen # */ + __u8 rsnident[128]; + /* We'll have to double fetch so we can get the header, + * parse dalb1->3 determine how much size we need for the + * log then alloc below. Or just do a secondary non-struct + * allocation. 
+ */ + __u8 telemetry_dataarea[0]; +}; + +struct nvme_endurance_group_log { + __u32 rsvd0; + __u8 avl_spare_threshold; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 endurance_estimate[16]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 media_units_written[16]; + __u8 rsvd96[416]; +}; + +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __le32 warning_temp_time; + __le32 critical_comp_time; + __le16 temp_sensor[8]; + __le32 thm_temp1_trans_count; + __le32 thm_temp2_trans_count; + __le32 thm_temp1_total_time; + __le32 thm_temp2_total_time; + __u8 rsvd232[280]; +}; + +struct nvme_self_test_res { + __u8 device_self_test_status; + __u8 segment_num; + __u8 valid_diagnostic_info; + __u8 rsvd; + __le64 power_on_hours; + __le32 nsid; + __le64 failing_lba; + __u8 status_code_type; + __u8 status_code; + __u8 vendor_specific[2]; +} __attribute__((packed)); + +struct nvme_self_test_log { + __u8 crnt_dev_selftest_oprn; + __u8 crnt_dev_selftest_compln; + __u8 rsvd[2]; + struct nvme_self_test_res result[20]; +} __attribute__((packed)); + +struct nvme_fw_slot_info_log { + __u8 afi; + __u8 rsvd1[7]; + __le64 frs[7]; + __u8 rsvd64[448]; +}; + +/* NVMe Namespace Write Protect State */ +enum { + NVME_NS_NO_WRITE_PROTECT = 0, + NVME_NS_WRITE_PROTECT, + NVME_NS_WRITE_PROTECT_POWER_CYCLE, + NVME_NS_WRITE_PROTECT_PERMANENT, +}; + +#define NVME_MAX_CHANGED_NAMESPACES 1024 + +struct nvme_changed_ns_list_log { + __le32 log[NVME_MAX_CHANGED_NAMESPACES]; +}; + +enum { + NVME_CMD_EFFECTS_CSUPP = 1 << 0, + NVME_CMD_EFFECTS_LBCC = 1 << 1, + NVME_CMD_EFFECTS_NCC = 1 << 2, + NVME_CMD_EFFECTS_NIC = 1 << 3, + NVME_CMD_EFFECTS_CCC = 1 << 4, + NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, +}; + +struct nvme_effects_log { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + +enum nvme_ana_state { + NVME_ANA_OPTIMIZED = 0x01, + NVME_ANA_NONOPTIMIZED = 0x02, + NVME_ANA_INACCESSIBLE = 0x03, + NVME_ANA_PERSISTENT_LOSS = 0x04, + NVME_ANA_CHANGE = 0x0f, +}; + +struct nvme_ana_group_desc { + __le32 grpid; + __le32 nnsids; + __le64 chgcnt; + __u8 state; + __u8 rsvd17[15]; + __le32 nsids[]; +}; + +/* flag for the log specific field of the ANA log */ +#define NVME_ANA_LOG_RGO (1 << 0) + +struct nvme_ana_rsp_hdr { + __le64 chgcnt; + __le16 ngrps; + __le16 rsvd10[3]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + +enum { + NVME_AER_ERROR = 0, + NVME_AER_SMART = 1, + NVME_AER_CSS = 6, + NVME_AER_VS = 7, + NVME_AER_NOTICE_NS_CHANGED = 0x0002, + NVME_AER_NOTICE_ANA = 0x0003, + NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, +}; + +struct nvme_lba_range_type { + __u8 type; + __u8 attributes; + __u8 rsvd2[14]; + __u64 slba; + __u64 nlb; + __u8 guid[16]; + __u8 rsvd48[16]; +}; + +enum { + NVME_LBART_TYPE_FS = 0x01, + NVME_LBART_TYPE_RAID = 0x02, + NVME_LBART_TYPE_CACHE = 0x03, + NVME_LBART_TYPE_SWAP = 0x04, + + NVME_LBART_ATTRIB_TEMP = 1 << 0, + NVME_LBART_ATTRIB_HIDE = 1 << 1, +}; + +struct nvme_plm_config { + __u16 enable_event; + __u8 rsvd2[30]; + __u64 dtwin_reads_thresh; + __u64 
dtwin_writes_thresh; + __u64 dtwin_time_thresh; + __u8 rsvd56[456]; +}; + +struct nvme_reservation_status { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[13]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 hostid; + __le64 rkey; + } regctl_ds[]; +}; + +struct nvme_reservation_status_ext { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[14]; + __u8 resv24[40]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 rkey; + __u8 hostid[16]; + __u8 resv32[32]; + } regctl_eds[]; +}; + +enum nvme_async_event_type { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, +}; + +/* I/O commands */ + +enum nvme_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_write_zeroes = 0x08, + nvme_cmd_dsm = 0x09, + nvme_cmd_resv_register = 0x0d, + nvme_cmd_resv_report = 0x0e, + nvme_cmd_resv_acquire = 0x11, + nvme_cmd_resv_release = 0x15, +}; + +/* + * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier + * + * @NVME_SGL_FMT_ADDRESS: absolute address of the data block + * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_TRANSPORT_A: transport defined format, value 0xA + * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation + * request subtype + */ +enum { + NVME_SGL_FMT_ADDRESS = 0x00, + NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_TRANSPORT_A = 0x0A, + NVME_SGL_FMT_INVALIDATE = 0x0f, +}; + +/* + * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier + * + * For struct nvme_sgl_desc: + * @NVME_SGL_FMT_DATA_DESC: data block descriptor + * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor + * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor + * + * For struct nvme_keyed_sgl_desc: + * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + * + * Transport-specific SGL types: + * @NVME_TRANSPORT_SGL_DATA_DESC: Transport SGL data dlock descriptor + */ +enum { + NVME_SGL_FMT_DATA_DESC = 0x00, + NVME_SGL_FMT_SEG_DESC = 0x02, + NVME_SGL_FMT_LAST_SEG_DESC = 0x03, + NVME_KEY_SGL_FMT_DATA_DESC = 0x04, + NVME_TRANSPORT_SGL_DATA_DESC = 0x05, +}; + +struct nvme_sgl_desc { + __le64 addr; + __le32 length; + __u8 rsvd[3]; + __u8 type; +}; + +struct nvme_keyed_sgl_desc { + __le64 addr; + __u8 length[3]; + __u8 key[4]; + __u8 type; +}; + +union nvme_data_ptr { + struct { + __le64 prp1; + __le64 prp2; + }; + struct nvme_sgl_desc sgl; + struct nvme_keyed_sgl_desc ksgl; +}; + +/* + * Lowest two bits of our flags field (FUSE field in the spec): + * + * @NVME_CMD_FUSE_FIRST: Fused Operation, first command + * @NVME_CMD_FUSE_SECOND: Fused Operation, second command + * + * Highest two bits in our flags field (PSDT field in the spec): + * + * @NVME_CMD_PSDT_SGL_METABUF: Use SGLS for this transfer, + * If used, MPTR contains addr of single physical buffer (byte aligned). + * @NVME_CMD_PSDT_SGL_METASEG: Use SGLS for this transfer, + * If used, MPTR contains an address of an SGL segment containing + * exactly 1 SGL descriptor (qword aligned). 
+ */ +enum { + NVME_CMD_FUSE_FIRST = (1 << 0), + NVME_CMD_FUSE_SECOND = (1 << 1), + + NVME_CMD_SGL_METABUF = (1 << 6), + NVME_CMD_SGL_METASEG = (1 << 7), + NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG, +}; + +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + union nvme_data_ptr dptr; + __le32 cdw10[6]; +}; + +struct nvme_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DEAC = 1 << 9, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_DTYPE_STREAMS = 1 << 4, +}; + +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +#define NVME_DSM_MAX_RANGES 256 + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + +struct nvme_write_zeroes_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +/* Features */ + +struct nvme_feat_auto_pst { + __le64 entries[32]; +}; + +enum { + NVME_HOST_MEM_ENABLE = (1 << 0), + NVME_HOST_MEM_RETURN = (1 << 1), +}; + +/* Admin commands */ + +enum nvme_admin_opcode { + nvme_admin_delete_sq = 0x00, + nvme_admin_create_sq = 0x01, + nvme_admin_get_log_page = 0x02, + nvme_admin_delete_cq = 0x04, + nvme_admin_create_cq = 0x05, + nvme_admin_identify = 0x06, + nvme_admin_abort_cmd = 0x08, + nvme_admin_set_features = 0x09, + nvme_admin_get_features = 0x0a, + nvme_admin_async_event = 0x0c, + nvme_admin_ns_mgmt = 0x0d, + nvme_admin_activate_fw = 0x10, + nvme_admin_download_fw = 0x11, + nvme_admin_dev_self_test = 0x14, + nvme_admin_ns_attach = 0x15, + nvme_admin_keep_alive = 0x18, + nvme_admin_directive_send = 0x19, + nvme_admin_directive_recv = 0x1a, + nvme_admin_virtual_mgmt = 0x1c, + nvme_admin_nvme_mi_send = 0x1d, + nvme_admin_nvme_mi_recv = 0x1e, + nvme_admin_dbbuf = 0x7C, + nvme_admin_format_nvm = 0x80, + nvme_admin_security_send = 0x81, + nvme_admin_security_recv = 0x82, + nvme_admin_sanitize_nvm = 0x84, +}; + +enum { + NVME_QUEUE_PHYS_CONTIG = (1 << 0), + NVME_CQ_IRQ_ENABLED = (1 << 1), + NVME_SQ_PRIO_URGENT = (0 << 1), + NVME_SQ_PRIO_HIGH = (1 << 1), + NVME_SQ_PRIO_MEDIUM = (2 << 1), + NVME_SQ_PRIO_LOW = (3 << 1), + NVME_FEAT_ARBITRATION = 0x01, + NVME_FEAT_POWER_MGMT = 0x02, + NVME_FEAT_LBA_RANGE = 0x03, + NVME_FEAT_TEMP_THRESH = 0x04, + NVME_FEAT_ERR_RECOVERY = 
0x05, + NVME_FEAT_VOLATILE_WC = 0x06, + NVME_FEAT_NUM_QUEUES = 0x07, + NVME_FEAT_IRQ_COALESCE = 0x08, + NVME_FEAT_IRQ_CONFIG = 0x09, + NVME_FEAT_WRITE_ATOMIC = 0x0a, + NVME_FEAT_ASYNC_EVENT = 0x0b, + NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_HOST_MEM_BUF = 0x0d, + NVME_FEAT_TIMESTAMP = 0x0e, + NVME_FEAT_KATO = 0x0f, + NVME_FEAT_HCTM = 0X10, + NVME_FEAT_NOPSC = 0X11, + NVME_FEAT_RRL = 0x12, + NVME_FEAT_PLM_CONFIG = 0x13, + NVME_FEAT_PLM_WINDOW = 0x14, + NVME_FEAT_SW_PROGRESS = 0x80, + NVME_FEAT_HOST_ID = 0x81, + NVME_FEAT_RESV_MASK = 0x82, + NVME_FEAT_RESV_PERSIST = 0x83, + NVME_FEAT_WRITE_PROTECT = 0x84, + NVME_LOG_ERROR = 0x01, + NVME_LOG_SMART = 0x02, + NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CHANGED_NS = 0x04, + NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_DEVICE_SELF_TEST = 0x06, + NVME_LOG_TELEMETRY_HOST = 0x07, + NVME_LOG_TELEMETRY_CTRL = 0x08, + NVME_LOG_ENDURANCE_GROUP = 0x09, + NVME_LOG_ANA = 0x0c, + NVME_LOG_DISC = 0x70, + NVME_LOG_RESERVATION = 0x80, + NVME_LOG_SANITIZE = 0x81, + NVME_FWACT_REPL = (0 << 3), + NVME_FWACT_REPL_ACTV = (1 << 3), + NVME_FWACT_ACTV = (2 << 3), +}; + +enum { + NVME_NO_LOG_LSP = 0x0, + NVME_NO_LOG_LPO = 0x0, + NVME_LOG_ANA_LSP_RGO = 0x1, + NVME_TELEM_LSP_CREATE = 0x1, +}; + +/* Sanitize and Sanitize Monitor/Log */ +enum { + /* Sanitize */ + NVME_SANITIZE_NO_DEALLOC = 0x00000200, + NVME_SANITIZE_OIPBP = 0x00000100, + NVME_SANITIZE_OWPASS_SHIFT = 0x00000004, + NVME_SANITIZE_AUSE = 0x00000008, + NVME_SANITIZE_ACT_CRYPTO_ERASE = 0x00000004, + NVME_SANITIZE_ACT_OVERWRITE = 0x00000003, + NVME_SANITIZE_ACT_BLOCK_ERASE = 0x00000002, + NVME_SANITIZE_ACT_EXIT = 0x00000001, + + /* Sanitize Monitor/Log */ + NVME_SANITIZE_LOG_DATA_LEN = 0x0014, + NVME_SANITIZE_LOG_GLOBAL_DATA_ERASED = 0x0100, + NVME_SANITIZE_LOG_NUM_CMPLTED_PASS_MASK = 0x00F8, + NVME_SANITIZE_LOG_STATUS_MASK = 0x0007, + NVME_SANITIZE_LOG_NEVER_SANITIZED = 0x0000, + NVME_SANITIZE_LOG_COMPLETED_SUCCESS = 0x0001, + NVME_SANITIZE_LOG_IN_PROGESS = 0x0002, + NVME_SANITIZE_LOG_COMPLETED_FAILED = 0x0003, +}; + +enum { + /* Self-test log Validation bits */ + NVME_SELF_TEST_VALID_NSID = 1 << 0, + NVME_SELF_TEST_VALID_FLBA = 1 << 1, + NVME_SELF_TEST_VALID_SCT = 1 << 2, + NVME_SELF_TEST_VALID_SC = 1 << 3, + NVME_SELF_TEST_REPORTS = 20, +}; + +struct nvme_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 cns; + __u8 rsvd3; + __le16 ctrlid; + __u32 rsvd11[5]; +}; + +#define NVME_IDENTIFY_DATA_SIZE 4096 + +struct nvme_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 fid; + __le32 dword11; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct nvme_host_mem_buf_desc { + __le64 addr; + __le32 size; + __u32 rsvd; +}; + +struct nvme_create_cq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 cqid; + __le16 qsize; + __le16 cq_flags; + __le16 irq_vector; + __u32 rsvd12[4]; +}; + +struct nvme_create_sq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 sqid; + __le16 qsize; + __le16 sq_flags; + __le16 cqid; + __u32 rsvd12[4]; +}; + +struct nvme_delete_queue { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 qid; + __u16 rsvd10; + __u32 rsvd11[5]; +}; + +struct nvme_abort_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 sqid; + __u16 cid; + __u32 rsvd11[5]; +}; + +struct 
nvme_download_firmware { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + union nvme_data_ptr dptr; + __le32 numd; + __le32 offset; + __u32 rsvd12[4]; +}; + +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + +struct nvme_get_log_page_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 lid; + __u8 lsp; + __le16 numdl; + __le16 numdu; + __u16 rsvd11; + __le32 lpol; + __le32 lpou; + __u32 rsvd14[2]; +}; + +struct nvme_directive_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 numd; + __u8 doper; + __u8 dtype; + __le16 dspec; + __u8 endir; + __u8 tdtype; + __u16 rsvd15; + + __u32 rsvd16[3]; +}; + +/* Sanitize Log Page */ +struct nvme_sanitize_log_page { + __le16 progress; + __le16 status; + __le32 cdw10_info; + __le32 est_ovrwrt_time; + __le32 est_blk_erase_time; + __le32 est_crypto_erase_time; +}; + +/* + * Fabrics subcommands. + */ +enum nvmf_fabrics_opcode { + nvme_fabrics_command = 0x7f, +}; + +enum nvmf_capsule_command { + nvme_fabrics_type_property_set = 0x00, + nvme_fabrics_type_connect = 0x01, + nvme_fabrics_type_property_get = 0x04, +}; + +struct nvmf_common_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 ts[24]; +}; + +/* + * The legal cntlid range a NVMe Target will provide. + * Note that cntlid of value 0 is considered illegal in the fabrics world. + * Devices based on earlier specs did not have the subsystem concept; + * therefore, those devices had their cntlid value set to 0 as a result. + */ +#define NVME_CNTLID_MIN 1 +#define NVME_CNTLID_MAX 0xffef +#define NVME_CNTLID_DYNAMIC 0xffff + +#define MAX_DISC_LOGS 255 + +/* Discovery log page entry */ +struct nvmf_disc_rsp_page_entry { + __u8 trtype; + __u8 adrfam; + __u8 subtype; + __u8 treq; + __le16 portid; + __le16 cntlid; + __le16 asqsz; + __u8 resv8[22]; + char trsvcid[NVMF_TRSVCID_SIZE]; + __u8 resv64[192]; + char subnqn[NVMF_NQN_FIELD_LEN]; + char traddr[NVMF_TRADDR_SIZE]; + union tsas { + char common[NVMF_TSAS_SIZE]; + struct rdma { + __u8 qptype; + __u8 prtype; + __u8 cms; + __u8 resv3[5]; + __u16 pkey; + __u8 resv10[246]; + } rdma; + struct tcp { + __u8 sectype; + } tcp; + } tsas; +}; + +/* Discovery log page header */ +struct nvmf_disc_rsp_page_hdr { + __le64 genctr; + __le64 numrec; + __le16 recfmt; + __u8 resv14[1006]; + struct nvmf_disc_rsp_page_entry entries[0]; +}; + +struct nvmf_connect_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __le16 recfmt; + __le16 qid; + __le16 sqsize; + __u8 cattr; + __u8 resv3; + __le32 kato; + __u8 resv4[12]; +}; + +struct nvmf_connect_data { + uuid_t hostid; + __le16 cntlid; + char resv4[238]; + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; + char resv5[256]; +}; + +struct nvmf_property_set_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __le64 value; + __u8 resv4[8]; +}; + +struct nvmf_property_get_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __u8 resv4[16]; +}; + +struct nvme_dbbuf { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + 
__u32 rsvd12[6]; +}; + +struct streams_directive_params { + __le16 msl; + __le16 nssa; + __le16 nsso; + __u8 rsvd[10]; + __le32 sws; + __le16 sgs; + __le16 nsa; + __le16 nso; + __u8 rsvd2[6]; +}; + +struct nvme_command { + union { + struct nvme_common_command common; + struct nvme_rw_command rw; + struct nvme_identify identify; + struct nvme_features features; + struct nvme_create_cq create_cq; + struct nvme_create_sq create_sq; + struct nvme_delete_queue delete_queue; + struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; + struct nvme_dsm_cmd dsm; + struct nvme_write_zeroes_cmd write_zeroes; + struct nvme_abort_cmd abort; + struct nvme_get_log_page_command get_log_page; + struct nvmf_common_command fabrics; + struct nvmf_connect_command connect; + struct nvmf_property_set_command prop_set; + struct nvmf_property_get_command prop_get; + struct nvme_dbbuf dbbuf; + struct nvme_directive_cmd directive; + }; +}; + +static inline bool nvme_is_write(struct nvme_command *cmd) +{ + /* + * What a mess... + * + * Why can't we simply have a Fabrics In and Fabrics out command? + */ + if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + return cmd->fabrics.fctype & 1; + return cmd->common.opcode & 1; +} + +enum { + /* + * Generic Command Status: + */ + NVME_SC_SUCCESS = 0x0, + NVME_SC_INVALID_OPCODE = 0x1, + NVME_SC_INVALID_FIELD = 0x2, + NVME_SC_CMDID_CONFLICT = 0x3, + NVME_SC_DATA_XFER_ERROR = 0x4, + NVME_SC_POWER_LOSS = 0x5, + NVME_SC_INTERNAL = 0x6, + NVME_SC_ABORT_REQ = 0x7, + NVME_SC_ABORT_QUEUE = 0x8, + NVME_SC_FUSED_FAIL = 0x9, + NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_SGL_INVALID_LAST = 0xd, + NVME_SC_SGL_INVALID_COUNT = 0xe, + NVME_SC_SGL_INVALID_DATA = 0xf, + NVME_SC_SGL_INVALID_METADATA = 0x10, + NVME_SC_SGL_INVALID_TYPE = 0x11, + + NVME_SC_SGL_INVALID_OFFSET = 0x16, + NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + + NVME_SC_SANITIZE_FAILED = 0x1C, + NVME_SC_SANITIZE_IN_PROGRESS = 0x1D, + + NVME_SC_NS_WRITE_PROTECTED = 0x20, + + NVME_SC_LBA_RANGE = 0x80, + NVME_SC_CAP_EXCEEDED = 0x81, + NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_RESERVATION_CONFLICT = 0x83, + + /* + * Command Specific Status: + */ + NVME_SC_CQ_INVALID = 0x100, + NVME_SC_QID_INVALID = 0x101, + NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_ABORT_LIMIT = 0x103, + NVME_SC_ABORT_MISSING = 0x104, + NVME_SC_ASYNC_LIMIT = 0x105, + NVME_SC_FIRMWARE_SLOT = 0x106, + NVME_SC_FIRMWARE_IMAGE = 0x107, + NVME_SC_INVALID_VECTOR = 0x108, + NVME_SC_INVALID_LOG_PAGE = 0x109, + NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_FW_NEEDS_CONV_RESET = 0x10b, + NVME_SC_INVALID_QUEUE = 0x10c, + NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, + NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, + NVME_SC_FEATURE_NOT_PER_NS = 0x10f, + NVME_SC_FW_NEEDS_SUBSYS_RESET = 0x110, + NVME_SC_FW_NEEDS_RESET = 0x111, + NVME_SC_FW_NEEDS_MAX_TIME = 0x112, + NVME_SC_FW_ACIVATE_PROHIBITED = 0x113, + NVME_SC_OVERLAPPING_RANGE = 0x114, + NVME_SC_NS_INSUFFICENT_CAP = 0x115, + NVME_SC_NS_ID_UNAVAILABLE = 0x116, + NVME_SC_NS_ALREADY_ATTACHED = 0x118, + NVME_SC_NS_IS_PRIVATE = 0x119, + NVME_SC_NS_NOT_ATTACHED = 0x11a, + NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, + NVME_SC_CTRL_LIST_INVALID = 0x11c, + NVME_SC_BP_WRITE_PROHIBITED = 0x11e, + + /* + * I/O Command Set Specific - NVM commands: + */ + NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_INVALID_PI = 0x181, + NVME_SC_READ_ONLY = 0x182, + NVME_SC_ONCS_NOT_SUPPORTED = 0x183, + + /* + * I/O Command Set Specific - Fabrics commands: + */ + NVME_SC_CONNECT_FORMAT = 0x180, + NVME_SC_CONNECT_CTRL_BUSY = 
0x181, + NVME_SC_CONNECT_INVALID_PARAM = 0x182, + NVME_SC_CONNECT_RESTART_DISC = 0x183, + NVME_SC_CONNECT_INVALID_HOST = 0x184, + + NVME_SC_DISCOVERY_RESTART = 0x190, + NVME_SC_AUTH_REQUIRED = 0x191, + + /* + * Media and Data Integrity Errors: + */ + NVME_SC_WRITE_FAULT = 0x280, + NVME_SC_READ_ERROR = 0x281, + NVME_SC_GUARD_CHECK = 0x282, + NVME_SC_APPTAG_CHECK = 0x283, + NVME_SC_REFTAG_CHECK = 0x284, + NVME_SC_COMPARE_FAILED = 0x285, + NVME_SC_ACCESS_DENIED = 0x286, + NVME_SC_UNWRITTEN_BLOCK = 0x287, + + /* + * Path-related Errors: + */ + NVME_SC_ANA_PERSISTENT_LOSS = 0x301, + NVME_SC_ANA_INACCESSIBLE = 0x302, + NVME_SC_ANA_TRANSITION = 0x303, + + NVME_SC_DNR = 0x4000, +}; + +struct nvme_completion { + /* + * Used by Admin and Fabrics commands to return data: + */ + union nvme_result { + __le16 u16; + __le32 u32; + __le64 u64; + } result; + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +#define NVME_VS(major, minor, tertiary) \ + (((major) << 16) | ((minor) << 8) | (tertiary)) + +#define NVME_MAJOR(ver) ((ver) >> 16) +#define NVME_MINOR(ver) (((ver) >> 8) & 0xff) +#define NVME_TERTIARY(ver) ((ver) & 0xff) + +#endif /* _LINUX_NVME_H */ Index: multipath-tools-130222/libmultipath/nvme/linux/nvme_ioctl.h =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/nvme/linux/nvme_ioctl.h @@ -0,0 +1,67 @@ +/* + * Definitions for the NVM Express ioctl interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef _UAPI_LINUX_NVME_IOCTL_H +#define _UAPI_LINUX_NVME_IOCTL_H + +#include +#include + +struct nvme_user_io { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +}; + +struct nvme_passthru_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 result; +}; + +#define nvme_admin_cmd nvme_passthru_cmd + +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) +#define NVME_IOCTL_RESET _IO('N', 0x44) +#define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) +#define NVME_IOCTL_RESCAN _IO('N', 0x46) + +#endif /* _UAPI_LINUX_NVME_IOCTL_H */ Index: multipath-tools-130222/Makefile.inc =================================================================== --- multipath-tools-130222.orig/Makefile.inc +++ multipath-tools-130222/Makefile.inc @@ -37,6 +37,7 @@ mpathpersistdir = $(TOPDIR)/libmpathpers includedir = $(prefix)/usr/include mpathcmddir = $(TOPDIR)/libmpathcmd libdmmpdir = $(TOPDIR)/libdmmp +nvmedir = $(TOPDIR)/libmultipath/nvme pkgconfdir = $(prefix)/usr/$(LIB)/pkgconfig GZIP = /bin/gzip -9 -c Index: multipath-tools-130222/libmultipath/Makefile =================================================================== --- multipath-tools-130222.orig/libmultipath/Makefile +++ multipath-tools-130222/libmultipath/Makefile @@ -8,7 +8,7 @@ SONAME=0 DEVLIB = libmultipath.so LIBS = $(DEVLIB).$(SONAME) LIBDEPS = -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd -laio -CFLAGS += -fPIC -I$(mpathcmddir) -I$(mpathpersistdir) +CFLAGS += -fPIC -I$(mpathcmddir) -I$(mpathpersistdir) -I$(nvmedir) OBJS = memory.o parser.o vector.o devmapper.o \ hwtable.o blacklist.o util.o dmparser.o config.o \ @@ -17,7 +17,7 @@ OBJS = memory.o parser.o vector.o devmap switchgroup.o uxsock.o print.o alias.o log_pthread.o \ log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \ lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o \ - io_err_stat.o + io_err_stat.o nvme-lib.o LIBDM_API_FLUSH = $(shell grep -Ecs '^[a-z]*[[:space:]]+dm_task_no_flush' /usr/include/libdevmapper.h) @@ -46,6 +46,9 @@ endif all: $(LIBS) +nvme-lib.o: nvme-lib.c nvme-ioctl.c nvme-ioctl.h + $(CC) $(CFLAGS) -Wno-unused-function -c -o $@ $< + $(LIBS): $(OBJS) $(CC) $(LDFLAGS) $(SHARED_FLAGS) -Wl,-soname=$@ $(CFLAGS) -o $@ $(OBJS) $(LIBDEPS) ln -sf $@ $(DEVLIB) Index: multipath-tools-130222/libmultipath/nvme-ioctl.c =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/nvme-ioctl.c @@ -0,0 +1,869 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme-ioctl.h" + +static int nvme_verify_chr(int fd) +{ + static struct stat nvme_stat; + int err = fstat(fd, &nvme_stat); + + if (err < 0) { + perror("fstat"); + return errno; + } + if (!S_ISCHR(nvme_stat.st_mode)) { + fprintf(stderr, + "Error: requesting reset on non-controller handle\n"); + return ENOTBLK; + } + return 0; +} + +static int nvme_subsystem_reset(int fd) +{ + 
int ret; + + ret = nvme_verify_chr(fd); + if (ret) + return ret; + return ioctl(fd, NVME_IOCTL_SUBSYS_RESET); +} + +static int nvme_reset_controller(int fd) +{ + int ret; + + ret = nvme_verify_chr(fd); + if (ret) + return ret; + return ioctl(fd, NVME_IOCTL_RESET); +} + +static int nvme_ns_rescan(int fd) +{ + int ret; + + ret = nvme_verify_chr(fd); + if (ret) + return ret; + return ioctl(fd, NVME_IOCTL_RESCAN); +} + +static int nvme_get_nsid(int fd) +{ + static struct stat nvme_stat; + int err = fstat(fd, &nvme_stat); + + if (err < 0) + return -errno; + + if (!S_ISBLK(nvme_stat.st_mode)) { + fprintf(stderr, + "Error: requesting namespace-id from non-block device\n"); + errno = ENOTBLK; + return -errno; + } + return ioctl(fd, NVME_IOCTL_ID); +} + +static int nvme_submit_passthru(int fd, unsigned long ioctl_cmd, + struct nvme_passthru_cmd *cmd) +{ + return ioctl(fd, ioctl_cmd, cmd); +} + +static int nvme_submit_admin_passthru(int fd, struct nvme_passthru_cmd *cmd) +{ + return ioctl(fd, NVME_IOCTL_ADMIN_CMD, cmd); +} + +static int nvme_submit_io_passthru(int fd, struct nvme_passthru_cmd *cmd) +{ + return ioctl(fd, NVME_IOCTL_IO_CMD, cmd); +} + +static int nvme_passthru(int fd, unsigned long ioctl_cmd, __u8 opcode, + __u8 flags, __u16 rsvd, + __u32 nsid, __u32 cdw2, __u32 cdw3, __u32 cdw10, __u32 cdw11, + __u32 cdw12, __u32 cdw13, __u32 cdw14, __u32 cdw15, + __u32 data_len, void *data, __u32 metadata_len, + void *metadata, __u32 timeout_ms, __u32 *result) +{ + struct nvme_passthru_cmd cmd = { + .opcode = opcode, + .flags = flags, + .rsvd1 = rsvd, + .nsid = nsid, + .cdw2 = cdw2, + .cdw3 = cdw3, + .metadata = (__u64)(uintptr_t) metadata, + .addr = (__u64)(uintptr_t) data, + .metadata_len = metadata_len, + .data_len = data_len, + .cdw10 = cdw10, + .cdw11 = cdw11, + .cdw12 = cdw12, + .cdw13 = cdw13, + .cdw14 = cdw14, + .cdw15 = cdw15, + .timeout_ms = timeout_ms, + .result = 0, + }; + int err; + + err = nvme_submit_passthru(fd, ioctl_cmd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +static int nvme_io(int fd, __u8 opcode, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, __u16 appmask, void *data, + void *metadata) +{ + struct nvme_user_io io = { + .opcode = opcode, + .flags = 0, + .control = control, + .nblocks = nblocks, + .rsvd = 0, + .metadata = (__u64)(uintptr_t) metadata, + .addr = (__u64)(uintptr_t) data, + .slba = slba, + .dsmgmt = dsmgmt, + .reftag = reftag, + .appmask = appmask, + .apptag = apptag, + }; + return ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io); +} + +static int nvme_read(int fd, __u64 slba, __u16 nblocks, __u16 control, __u32 dsmgmt, + __u32 reftag, __u16 apptag, __u16 appmask, void *data, + void *metadata) +{ + return nvme_io(fd, nvme_cmd_read, slba, nblocks, control, dsmgmt, + reftag, apptag, appmask, data, metadata); +} + +static int nvme_write(int fd, __u64 slba, __u16 nblocks, __u16 control, __u32 dsmgmt, + __u32 reftag, __u16 apptag, __u16 appmask, void *data, + void *metadata) +{ + return nvme_io(fd, nvme_cmd_write, slba, nblocks, control, dsmgmt, + reftag, apptag, appmask, data, metadata); +} + +static int nvme_compare(int fd, __u64 slba, __u16 nblocks, __u16 control, __u32 dsmgmt, + __u32 reftag, __u16 apptag, __u16 appmask, void *data, + void *metadata) +{ + return nvme_io(fd, nvme_cmd_compare, slba, nblocks, control, dsmgmt, + reftag, apptag, appmask, data, metadata); +} + +static int nvme_passthru_io(int fd, __u8 opcode, __u8 flags, __u16 rsvd, + __u32 nsid, __u32 cdw2, __u32 cdw3, __u32 cdw10, + __u32 cdw11, 
__u32 cdw12, __u32 cdw13, __u32 cdw14, + __u32 cdw15, __u32 data_len, void *data, + __u32 metadata_len, void *metadata, __u32 timeout_ms) +{ + return nvme_passthru(fd, NVME_IOCTL_IO_CMD, opcode, flags, rsvd, nsid, + cdw2, cdw3, cdw10, cdw11, cdw12, cdw13, cdw14, + cdw15, data_len, data, metadata_len, metadata, + timeout_ms, NULL); +} + +static int nvme_write_zeros(int fd, __u32 nsid, __u64 slba, __u16 nlb, + __u16 control, __u32 reftag, __u16 apptag, __u16 appmask) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_write_zeroes, + .nsid = nsid, + .cdw10 = slba & 0xffffffff, + .cdw11 = slba >> 32, + .cdw12 = nlb | (control << 16), + .cdw14 = reftag, + .cdw15 = apptag | (appmask << 16), + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +static int nvme_write_uncorrectable(int fd, __u32 nsid, __u64 slba, __u16 nlb) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_write_uncor, + .nsid = nsid, + .cdw10 = slba & 0xffffffff, + .cdw11 = slba >> 32, + .cdw12 = nlb, + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +static int nvme_flush(int fd, __u32 nsid) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_flush, + .nsid = nsid, + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +static int nvme_dsm(int fd, __u32 nsid, __u32 cdw11, struct nvme_dsm_range *dsm, + __u16 nr_ranges) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_dsm, + .nsid = nsid, + .addr = (__u64)(uintptr_t) dsm, + .data_len = nr_ranges * sizeof(*dsm), + .cdw10 = nr_ranges - 1, + .cdw11 = cdw11, + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +static struct nvme_dsm_range *nvme_setup_dsm_range(__u32 *ctx_attrs, __u32 *llbas, + __u64 *slbas, __u16 nr_ranges) +{ + int i; + struct nvme_dsm_range *dsm = malloc(nr_ranges * sizeof(*dsm)); + + if (!dsm) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + return NULL; + } + for (i = 0; i < nr_ranges; i++) { + dsm[i].cattr = cpu_to_le32(ctx_attrs[i]); + dsm[i].nlb = cpu_to_le32(llbas[i]); + dsm[i].slba = cpu_to_le64(slbas[i]); + } + return dsm; +} + +static int nvme_resv_acquire(int fd, __u32 nsid, __u8 rtype, __u8 racqa, + bool iekey, __u64 crkey, __u64 nrkey) +{ + __le64 payload[2] = { cpu_to_le64(crkey), cpu_to_le64(nrkey) }; + __u32 cdw10 = (racqa & 0x7) | (iekey ? 1 << 3 : 0) | rtype << 8; + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_resv_acquire, + .nsid = nsid, + .cdw10 = cdw10, + .addr = (__u64)(uintptr_t) (payload), + .data_len = sizeof(payload), + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +static int nvme_resv_register(int fd, __u32 nsid, __u8 rrega, __u8 cptpl, + bool iekey, __u64 crkey, __u64 nrkey) +{ + __le64 payload[2] = { cpu_to_le64(crkey), cpu_to_le64(nrkey) }; + __u32 cdw10 = (rrega & 0x7) | (iekey ? 1 << 3 : 0) | cptpl << 30; + + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_resv_register, + .nsid = nsid, + .cdw10 = cdw10, + .addr = (__u64)(uintptr_t) (payload), + .data_len = sizeof(payload), + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +static int nvme_resv_release(int fd, __u32 nsid, __u8 rtype, __u8 rrela, + bool iekey, __u64 crkey) +{ + __le64 payload[1] = { cpu_to_le64(crkey) }; + __u32 cdw10 = (rrela & 0x7) | (iekey ? 
1 << 3 : 0) | rtype << 8; + + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_resv_release, + .nsid = nsid, + .cdw10 = cdw10, + .addr = (__u64)(uintptr_t) (payload), + .data_len = sizeof(payload), + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +static int nvme_resv_report(int fd, __u32 nsid, __u32 numd, __u32 cdw11, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_resv_report, + .nsid = nsid, + .cdw10 = numd, + .cdw11 = cdw11, + .addr = (__u64)(uintptr_t) data, + .data_len = (numd + 1) << 2, + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +static int nvme_identify13(int fd, __u32 nsid, __u32 cdw10, __u32 cdw11, void *data) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_identify, + .nsid = nsid, + .addr = (__u64)(uintptr_t) data, + .data_len = NVME_IDENTIFY_DATA_SIZE, + .cdw10 = cdw10, + .cdw11 = cdw11, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +static int nvme_identify(int fd, __u32 nsid, __u32 cdw10, void *data) +{ + return nvme_identify13(fd, nsid, cdw10, 0, data); +} + +static int nvme_identify_ctrl(int fd, void *data) +{ + return nvme_identify(fd, 0, 1, data); +} + +static int nvme_identify_ns(int fd, __u32 nsid, bool present, void *data) +{ + int cns = present ? NVME_ID_CNS_NS_PRESENT : NVME_ID_CNS_NS; + + return nvme_identify(fd, nsid, cns, data); +} + +static int nvme_identify_ns_list(int fd, __u32 nsid, bool all, void *data) +{ + int cns = all ? NVME_ID_CNS_NS_PRESENT_LIST : NVME_ID_CNS_NS_ACTIVE_LIST; + + return nvme_identify(fd, nsid, cns, data); +} + +static int nvme_identify_ctrl_list(int fd, __u32 nsid, __u16 cntid, void *data) +{ + int cns = nsid ? NVME_ID_CNS_CTRL_NS_LIST : NVME_ID_CNS_CTRL_LIST; + + return nvme_identify(fd, nsid, (cntid << 16) | cns, data); +} + +static int nvme_identify_ns_descs(int fd, __u32 nsid, void *data) +{ + + return nvme_identify(fd, nsid, NVME_ID_CNS_NS_DESC_LIST, data); +} + +static int nvme_identify_nvmset(int fd, __u16 nvmset_id, void *data) +{ + return nvme_identify13(fd, 0, NVME_ID_CNS_NVMSET_LIST, nvmset_id, data); +} + +static int nvme_get_log13(int fd, __u32 nsid, __u8 log_id, __u8 lsp, __u64 lpo, + __u16 lsi, bool rae, __u32 data_len, void *data) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_get_log_page, + .nsid = nsid, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + }; + __u32 numd = (data_len >> 2) - 1; + __u16 numdu = numd >> 16, numdl = numd & 0xffff; + + cmd.cdw10 = log_id | (numdl << 16) | (rae ? 1 << 15 : 0); + if (lsp) + cmd.cdw10 |= lsp << 8; + + cmd.cdw11 = numdu | (lsi << 16); + cmd.cdw12 = lpo; + cmd.cdw13 = (lpo >> 32); + + return nvme_submit_admin_passthru(fd, &cmd); + +} + +static int nvme_get_log(int fd, __u32 nsid, __u8 log_id, bool rae, + __u32 data_len, void *data) +{ + void *ptr = data; + __u32 offset = 0, xfer_len = data_len; + int ret; + + /* + * 4k is the smallest possible transfer unit, so by + * restricting ourselves for 4k transfers we avoid having + * to check the MDTS value of the controller. 
+ */ + do { + xfer_len = data_len - offset; + if (xfer_len > 4096) + xfer_len = 4096; + + ret = nvme_get_log13(fd, nsid, log_id, NVME_NO_LOG_LSP, + offset, 0, rae, xfer_len, ptr); + if (ret) + return ret; + + offset += xfer_len; + ptr += xfer_len; + } while (offset < data_len); + + return 0; +} + +static int nvme_get_telemetry_log(int fd, void *lp, int generate_report, + int ctrl_init, size_t log_page_size, __u64 offset) +{ + if (ctrl_init) + return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_TELEMETRY_CTRL, + NVME_NO_LOG_LSP, offset, + 0, 1, log_page_size, lp); + if (generate_report) + return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_TELEMETRY_HOST, + NVME_TELEM_LSP_CREATE, offset, + 0, 1, log_page_size, lp); + else + return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_TELEMETRY_HOST, + NVME_NO_LOG_LSP, offset, + 0, 1, log_page_size, lp); +} + +static int nvme_fw_log(int fd, struct nvme_firmware_log_page *fw_log) +{ + return nvme_get_log(fd, NVME_NSID_ALL, NVME_LOG_FW_SLOT, true, + sizeof(*fw_log), fw_log); +} + +static int nvme_changed_ns_list_log(int fd, struct nvme_changed_ns_list_log *changed_ns_list_log) +{ + return nvme_get_log(fd, 0, NVME_LOG_CHANGED_NS, true, + sizeof(changed_ns_list_log->log), + changed_ns_list_log->log); +} + +static int nvme_error_log(int fd, int entries, struct nvme_error_log_page *err_log) +{ + return nvme_get_log(fd, NVME_NSID_ALL, NVME_LOG_ERROR, false, + entries * sizeof(*err_log), err_log); +} + +static int nvme_endurance_log(int fd, __u16 group_id, struct nvme_endurance_group_log *endurance_log) +{ + return nvme_get_log13(fd, 0, NVME_LOG_ENDURANCE_GROUP, 0, 0, group_id, 0, + sizeof(*endurance_log), endurance_log); +} + +static int nvme_smart_log(int fd, __u32 nsid, struct nvme_smart_log *smart_log) +{ + return nvme_get_log(fd, nsid, NVME_LOG_SMART, false, + sizeof(*smart_log), smart_log); +} + +static int nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo) +{ + __u64 lpo = 0; + + return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_ANA, rgo, lpo, 0, + true, ana_log_len, ana_log); +} + +static int nvme_self_test_log(int fd, struct nvme_self_test_log *self_test_log) +{ + return nvme_get_log(fd, NVME_NSID_ALL, NVME_LOG_DEVICE_SELF_TEST, false, + sizeof(*self_test_log), self_test_log); +} + +static int nvme_effects_log(int fd, struct nvme_effects_log_page *effects_log) +{ + return nvme_get_log(fd, 0, NVME_LOG_CMD_EFFECTS, false, + sizeof(*effects_log), effects_log); +} + +static int nvme_discovery_log(int fd, struct nvmf_disc_rsp_page_hdr *log, __u32 size) +{ + return nvme_get_log(fd, 0, NVME_LOG_DISC, false, size, log); +} + +static int nvme_sanitize_log(int fd, struct nvme_sanitize_log_page *sanitize_log) +{ + return nvme_get_log(fd, 0, NVME_LOG_SANITIZE, false, + sizeof(*sanitize_log), sanitize_log); +} + +static int nvme_feature(int fd, __u8 opcode, __u32 nsid, __u32 cdw10, __u32 cdw11, + __u32 cdw12, __u32 data_len, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = opcode, + .nsid = nsid, + .cdw10 = cdw10, + .cdw11 = cdw11, + .cdw12 = cdw12, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +static int nvme_set_feature(int fd, __u32 nsid, __u8 fid, __u32 value, __u32 cdw12, + bool save, __u32 data_len, void *data, __u32 *result) +{ + __u32 cdw10 = fid | (save ? 
0x80000000 : 0); + + return nvme_feature(fd, nvme_admin_set_features, nsid, cdw10, value, + cdw12, data_len, data, result); +} + +static int nvme_property(int fd, __u8 fctype, __le32 off, __le64 *value, __u8 attrib) +{ + int err; + struct nvme_admin_cmd cmd = { + .opcode = nvme_fabrics_command, + .cdw10 = attrib, + .cdw11 = off, + }; + + if (!value) { + errno = EINVAL; + return -errno; + } + + if (fctype == nvme_fabrics_type_property_get){ + cmd.nsid = nvme_fabrics_type_property_get; + } else if(fctype == nvme_fabrics_type_property_set) { + cmd.nsid = nvme_fabrics_type_property_set; + cmd.cdw12 = *value; + } else { + errno = EINVAL; + return -errno; + } + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && fctype == nvme_fabrics_type_property_get) + *value = cpu_to_le64(cmd.result); + return err; +} + +static int get_property_helper(int fd, int offset, void *value, int *advance) +{ + __le64 value64; + int err = -EINVAL; + + switch (offset) { + case NVME_REG_CAP: + case NVME_REG_ASQ: + case NVME_REG_ACQ: + *advance = 8; + break; + default: + *advance = 4; + } + + if (!value) + return err; + + err = nvme_property(fd, nvme_fabrics_type_property_get, + cpu_to_le32(offset), &value64, (*advance == 8)); + + if (!err) { + if (*advance == 8) + *((uint64_t *)value) = le64_to_cpu(value64); + else + *((uint32_t *)value) = le32_to_cpu(value64); + } + + return err; +} + +static int nvme_get_property(int fd, int offset, uint64_t *value) +{ + int advance; + return get_property_helper(fd, offset, value, &advance); +} + +static int nvme_get_properties(int fd, void **pbar) +{ + int offset, advance; + int err, ret = -EINVAL; + int size = getpagesize(); + + *pbar = malloc(size); + if (!*pbar) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + return -ENOMEM; + } + + memset(*pbar, 0xff, size); + for (offset = NVME_REG_CAP; offset <= NVME_REG_CMBSZ; offset += advance) { + err = get_property_helper(fd, offset, *pbar + offset, &advance); + if (!err) + ret = 0; + } + + return ret; +} + +static int nvme_set_property(int fd, int offset, int value) +{ + __le64 val = cpu_to_le64(value); + __le32 off = cpu_to_le32(offset); + bool is64bit; + + switch (off) { + case NVME_REG_CAP: + case NVME_REG_ASQ: + case NVME_REG_ACQ: + is64bit = true; + break; + default: + is64bit = false; + } + + return nvme_property(fd, nvme_fabrics_type_property_set, + off, &val, is64bit ? 
1: 0); +} + +static int nvme_get_feature(int fd, __u32 nsid, __u8 fid, __u8 sel, __u32 cdw11, + __u32 data_len, void *data, __u32 *result) +{ + __u32 cdw10 = fid | sel << 8; + + return nvme_feature(fd, nvme_admin_get_features, nsid, cdw10, cdw11, + 0, data_len, data, result); +} + +static int nvme_format(int fd, __u32 nsid, __u8 lbaf, __u8 ses, __u8 pi, + __u8 pil, __u8 ms, __u32 timeout) +{ + __u32 cdw10 = lbaf | ms << 4 | pi << 5 | pil << 8 | ses << 9; + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_format_nvm, + .nsid = nsid, + .cdw10 = cdw10, + .timeout_ms = timeout, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +static int nvme_ns_create(int fd, __u64 nsze, __u64 ncap, __u8 flbas, + __u8 dps, __u8 nmic, __u32 *result) +{ + struct nvme_id_ns ns = { + .nsze = cpu_to_le64(nsze), + .ncap = cpu_to_le64(ncap), + .flbas = flbas, + .dps = dps, + .nmic = nmic, + }; + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_ns_mgmt, + .addr = (__u64)(uintptr_t) ((void *)&ns), + .cdw10 = 0, + .data_len = 0x1000, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +static int nvme_ns_delete(int fd, __u32 nsid) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_ns_mgmt, + .nsid = nsid, + .cdw10 = 1, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +static int nvme_ns_attachment(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist, + bool attach) +{ + int i; + __u8 buf[0x1000]; + struct nvme_controller_list *cntlist = + (struct nvme_controller_list *)buf; + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_ns_attach, + .nsid = nsid, + .addr = (__u64)(uintptr_t) cntlist, + .cdw10 = attach ? 0 : 1, + .data_len = 0x1000, + }; + + memset(buf, 0, sizeof(buf)); + cntlist->num = cpu_to_le16(num_ctrls); + for (i = 0; i < num_ctrls; i++) + cntlist->identifier[i] = cpu_to_le16(ctrlist[i]); + + return nvme_submit_admin_passthru(fd, &cmd); +} + +static int nvme_ns_attach_ctrls(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist) +{ + return nvme_ns_attachment(fd, nsid, num_ctrls, ctrlist, true); +} + +static int nvme_ns_detach_ctrls(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist) +{ + return nvme_ns_attachment(fd, nsid, num_ctrls, ctrlist, false); +} + +static int nvme_fw_download(int fd, __u32 offset, __u32 data_len, void *data) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_download_fw, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + .cdw10 = (data_len >> 2) - 1, + .cdw11 = offset >> 2, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +static int nvme_fw_commit(int fd, __u8 slot, __u8 action, __u8 bpid) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_activate_fw, + .cdw10 = (bpid << 31) | (action << 3) | slot, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +static int nvme_sec_send(int fd, __u32 nsid, __u8 nssf, __u16 spsp, + __u8 secp, __u32 tl, __u32 data_len, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_security_send, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + .nsid = nsid, + .cdw10 = secp << 24 | spsp << 8 | nssf, + .cdw11 = tl, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +static int nvme_sec_recv(int fd, __u32 nsid, __u8 nssf, __u16 spsp, + __u8 secp, __u32 al, __u32 data_len, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_security_recv, + .nsid = nsid, + 
.cdw10 = secp << 24 | spsp << 8 | nssf, + .cdw11 = al, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +static int nvme_dir_send(int fd, __u32 nsid, __u16 dspec, __u8 dtype, __u8 doper, + __u32 data_len, __u32 dw12, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_directive_send, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + .nsid = nsid, + .cdw10 = data_len? (data_len >> 2) - 1 : 0, + .cdw11 = dspec << 16 | dtype << 8 | doper, + .cdw12 = dw12, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +static int nvme_dir_recv(int fd, __u32 nsid, __u16 dspec, __u8 dtype, __u8 doper, + __u32 data_len, __u32 dw12, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_directive_recv, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + .nsid = nsid, + .cdw10 = data_len? (data_len >> 2) - 1 : 0, + .cdw11 = dspec << 16 | dtype << 8 | doper, + .cdw12 = dw12, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +static int nvme_sanitize(int fd, __u8 sanact, __u8 ause, __u8 owpass, __u8 oipbp, + __u8 no_dealloc, __u32 ovrpat) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_sanitize_nvm, + .cdw10 = no_dealloc << 9 | oipbp << 8 | + owpass << NVME_SANITIZE_OWPASS_SHIFT | + ause << 3 | sanact, + .cdw11 = ovrpat, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +static int nvme_self_test_start(int fd, __u32 nsid, __u32 cdw10) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_dev_self_test, + .nsid = nsid, + .cdw10 = cdw10, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} Index: multipath-tools-130222/libmultipath/nvme-ioctl.h =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/nvme-ioctl.h @@ -0,0 +1,139 @@ +#ifndef _NVME_LIB_H +#define _NVME_LIB_H + +#include +#include +#include "linux/nvme_ioctl.h" +#include "nvme.h" + +static int nvme_get_nsid(int fd); + +/* Generic passthrough */ +static int nvme_submit_passthru(int fd, unsigned long ioctl_cmd, + struct nvme_passthru_cmd *cmd); + +static int nvme_passthru(int fd, unsigned long ioctl_cmd, __u8 opcode, __u8 flags, + __u16 rsvd, __u32 nsid, __u32 cdw2, __u32 cdw3, + __u32 cdw10, __u32 cdw11, __u32 cdw12, + __u32 cdw13, __u32 cdw14, __u32 cdw15, + __u32 data_len, void *data, __u32 metadata_len, + void *metadata, __u32 timeout_ms, __u32 *result); + +/* NVME_SUBMIT_IO */ +static int nvme_io(int fd, __u8 opcode, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, + __u16 appmask, void *data, void *metadata); + +static int nvme_read(int fd, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, + __u16 appmask, void *data, void *metadata); + +static int nvme_write(int fd, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, + __u16 appmask, void *data, void *metadata); + +static int nvme_compare(int fd, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, + __u16 appmask, void *data, void *metadata); + +/* NVME_IO_CMD */ +static int nvme_passthru_io(int fd, __u8 opcode, __u8 flags, __u16 rsvd, + __u32 nsid, __u32 cdw2, __u32 cdw3, + __u32 cdw10, __u32 
cdw11, __u32 cdw12, + __u32 cdw13, __u32 cdw14, __u32 cdw15, + __u32 data_len, void *data, __u32 metadata_len, + void *metadata, __u32 timeout); + +static int nvme_write_zeros(int fd, __u32 nsid, __u64 slba, __u16 nlb, + __u16 control, __u32 reftag, __u16 apptag, __u16 appmask); + +static int nvme_write_uncorrectable(int fd, __u32 nsid, __u64 slba, __u16 nlb); + +static int nvme_flush(int fd, __u32 nsid); + +static int nvme_dsm(int fd, __u32 nsid, __u32 cdw11, struct nvme_dsm_range *dsm, + __u16 nr_ranges); +static struct nvme_dsm_range *nvme_setup_dsm_range(__u32 *ctx_attrs, + __u32 *llbas, __u64 *slbas, + __u16 nr_ranges); + +static int nvme_resv_acquire(int fd, __u32 nsid, __u8 rtype, __u8 racqa, + bool iekey, __u64 crkey, __u64 nrkey); +static int nvme_resv_register(int fd, __u32 nsid, __u8 rrega, __u8 cptpl, + bool iekey, __u64 crkey, __u64 nrkey); +static int nvme_resv_release(int fd, __u32 nsid, __u8 rtype, __u8 rrela, + bool iekey, __u64 crkey); +static int nvme_resv_report(int fd, __u32 nsid, __u32 numd, __u32 cdw11, void *data); + +static int nvme_identify13(int fd, __u32 nsid, __u32 cdw10, __u32 cdw11, void *data); +static int nvme_identify(int fd, __u32 nsid, __u32 cdw10, void *data); +static int nvme_identify_ctrl(int fd, void *data); +static int nvme_identify_ns(int fd, __u32 nsid, bool present, void *data); +static int nvme_identify_ns_list(int fd, __u32 nsid, bool all, void *data); +static int nvme_identify_ctrl_list(int fd, __u32 nsid, __u16 cntid, void *data); +static int nvme_identify_ns_descs(int fd, __u32 nsid, void *data); +static int nvme_identify_nvmset(int fd, __u16 nvmset_id, void *data); +static int nvme_get_log13(int fd, __u32 nsid, __u8 log_id, __u8 lsp, __u64 lpo, + __u16 group_id, bool rae, __u32 data_len, void *data); +static int nvme_get_log(int fd, __u32 nsid, __u8 log_id, bool rae, + __u32 data_len, void *data); + + +static int nvme_get_telemetry_log(int fd, void *lp, int generate_report, + int ctrl_gen, size_t log_page_size, __u64 offset); +static int nvme_fw_log(int fd, struct nvme_firmware_log_page *fw_log); +static int nvme_changed_ns_list_log(int fd, + struct nvme_changed_ns_list_log *changed_ns_list_log); +static int nvme_error_log(int fd, int entries, struct nvme_error_log_page *err_log); +static int nvme_smart_log(int fd, __u32 nsid, struct nvme_smart_log *smart_log); +static int nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo); +static int nvme_effects_log(int fd, struct nvme_effects_log_page *effects_log); +static int nvme_discovery_log(int fd, struct nvmf_disc_rsp_page_hdr *log, __u32 size); +static int nvme_sanitize_log(int fd, struct nvme_sanitize_log_page *sanitize_log); +static int nvme_endurance_log(int fd, __u16 group_id, + struct nvme_endurance_group_log *endurance_log); + +static int nvme_feature(int fd, __u8 opcode, __u32 nsid, __u32 cdw10, + __u32 cdw11, __u32 cdw12, __u32 data_len, void *data, + __u32 *result); +static int nvme_set_feature(int fd, __u32 nsid, __u8 fid, __u32 value, __u32 cdw12, + bool save, __u32 data_len, void *data, __u32 *result); +static int nvme_get_feature(int fd, __u32 nsid, __u8 fid, __u8 sel, + __u32 cdw11, __u32 data_len, void *data, __u32 *result); + +static int nvme_format(int fd, __u32 nsid, __u8 lbaf, __u8 ses, __u8 pi, + __u8 pil, __u8 ms, __u32 timeout); + +static int nvme_ns_create(int fd, __u64 nsze, __u64 ncap, __u8 flbas, + __u8 dps, __u8 nmic, __u32 *result); +static int nvme_ns_delete(int fd, __u32 nsid); + +static int nvme_ns_attachment(int fd, __u32 nsid, __u16 num_ctrls, + 
__u16 *ctrlist, bool attach); +static int nvme_ns_attach_ctrls(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist); +static int nvme_ns_detach_ctrls(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist); + +static int nvme_fw_download(int fd, __u32 offset, __u32 data_len, void *data); +static int nvme_fw_commit(int fd, __u8 slot, __u8 action, __u8 bpid); + +static int nvme_sec_send(int fd, __u32 nsid, __u8 nssf, __u16 spsp, + __u8 secp, __u32 tl, __u32 data_len, void *data, __u32 *result); +static int nvme_sec_recv(int fd, __u32 nsid, __u8 nssf, __u16 spsp, + __u8 secp, __u32 al, __u32 data_len, void *data, __u32 *result); + +static int nvme_subsystem_reset(int fd); +static int nvme_reset_controller(int fd); +static int nvme_ns_rescan(int fd); + +static int nvme_dir_send(int fd, __u32 nsid, __u16 dspec, __u8 dtype, __u8 doper, + __u32 data_len, __u32 dw12, void *data, __u32 *result); +static int nvme_dir_recv(int fd, __u32 nsid, __u16 dspec, __u8 dtype, __u8 doper, + __u32 data_len, __u32 dw12, void *data, __u32 *result); +static int nvme_get_properties(int fd, void **pbar); +static int nvme_set_property(int fd, int offset, int value); +static int nvme_get_property(int fd, int offset, uint64_t *value); +static int nvme_sanitize(int fd, __u8 sanact, __u8 ause, __u8 owpass, __u8 oipbp, + __u8 no_dealloc, __u32 ovrpat); +static int nvme_self_test_start(int fd, __u32 nsid, __u32 cdw10); +static int nvme_self_test_log(int fd, struct nvme_self_test_log *self_test_log); +#endif /* _NVME_LIB_H */ Index: multipath-tools-130222/libmultipath/nvme-lib.c =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/nvme-lib.c @@ -0,0 +1,49 @@ +#include +/* avoid inclusion of standard API */ +#define _NVME_LIB_C 1 +#include "nvme-lib.h" +#include "nvme-ioctl.c" +#include "debug.h" + +int log_nvme_errcode(int err, const char *dev, const char *msg) +{ + if (err > 0) + condlog(3, "%s: %s: NVMe status %d", dev, msg, err); + else if (err < 0) + condlog(3, "%s: %s: %s", dev, msg, strerror(errno)); + return err; +} + +int libmp_nvme_get_nsid(int fd) +{ + return nvme_get_nsid(fd); +} + +int libmp_nvme_identify_ctrl(int fd, struct nvme_id_ctrl *ctrl) +{ + return nvme_identify_ctrl(fd, ctrl); +} + +int libmp_nvme_identify_ns(int fd, __u32 nsid, bool present, + struct nvme_id_ns *ns) +{ + return nvme_identify_ns(fd, nsid, present, ns); +} + +int libmp_nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo) +{ + return nvme_ana_log(fd, ana_log, ana_log_len, rgo); +} + +int nvme_id_ctrl_ana(int fd, struct nvme_id_ctrl *ctrl) +{ + int rc; + struct nvme_id_ctrl c; + + rc = nvme_identify_ctrl(fd, &c); + if (rc < 0) + return rc; + if (ctrl) + *ctrl = c; + return c.cmic & (1 << 3) ? 
1 : 0; +} Index: multipath-tools-130222/libmultipath/nvme-lib.h =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/nvme-lib.h @@ -0,0 +1,39 @@ +#ifndef NVME_LIB_H +#define NVME_LIB_H + +#include "nvme.h" + +int log_nvme_errcode(int err, const char *dev, const char *msg); +int libmp_nvme_get_nsid(int fd); +int libmp_nvme_identify_ctrl(int fd, struct nvme_id_ctrl *ctrl); +int libmp_nvme_identify_ns(int fd, __u32 nsid, bool present, + struct nvme_id_ns *ns); +int libmp_nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo); +/* + * Identify controller, and return true if ANA is supported + * ctrl will be filled in if controller is identified, even w/o ANA + * ctrl may be NULL + */ +int nvme_id_ctrl_ana(int fd, struct nvme_id_ctrl *ctrl); + +#ifndef _NVME_LIB_C +/* + * In all files except nvme-lib.c, the nvme functions can be called + * by their usual name. + */ +#define nvme_get_nsid libmp_nvme_get_nsid +#define nvme_identify_ctrl libmp_nvme_identify_ctrl +#define nvme_identify_ns libmp_nvme_identify_ns +#define nvme_ana_log libmp_nvme_ana_log +/* + * Undefine these to avoid clashes with libmultipath's byteorder.h + */ +#undef cpu_to_le16 +#undef cpu_to_le32 +#undef cpu_to_le64 +#undef le16_to_cpu +#undef le32_to_cpu +#undef le64_to_cpu +#endif + +#endif /* NVME_LIB_H */ Index: multipath-tools-130222/libmultipath/prio.h =================================================================== --- multipath-tools-130222.orig/libmultipath/prio.h +++ multipath-tools-130222/libmultipath/prio.h @@ -29,6 +29,7 @@ struct path; #define PRIO_RDAC "rdac" #define PRIO_DATACORE "datacore" #define PRIO_WEIGHTED_PATH "weightedpath" +#define PRIO_ANA "ana" /* * Value used to mark the fact prio was not defined Index: multipath-tools-130222/libmultipath/prioritizers/Makefile =================================================================== --- multipath-tools-130222.orig/libmultipath/prioritizers/Makefile +++ multipath-tools-130222/libmultipath/prioritizers/Makefile @@ -2,6 +2,7 @@ # # Copyright (C) 2007 Christophe Varoqui, # +TOPDIR = ../.. include ../../Makefile.inc LIBS = \ @@ -15,9 +16,10 @@ LIBS = \ libpriodatacore.so \ libpriohds.so \ libprioweightedpath.so \ + libprioana.so \ libprioiet.so -CFLAGS += -fPIC -I.. +CFLAGS += -fPIC -I.. -I$(nvmedir) all: $(LIBS) Index: multipath-tools-130222/libmultipath/prioritizers/ana.c =================================================================== --- /dev/null +++ multipath-tools-130222/libmultipath/prioritizers/ana.c @@ -0,0 +1,236 @@ +/* + * (C) Copyright HUAWEI Technology Corp. 2017 All Rights Reserved. + * + * ana.c + * Version 1.00 + * + * Tool to make use of a NVMe-feature called Asymmetric Namespace Access. + * It determines the ANA state of a device and prints a priority value to stdout. + * + * Author(s): Cheng Jike + * Li Jie + * + * This file is released under the GPL version 2, or any later version. 
+ */ +#include +#include +#include +#include +#include +#include + +#include "debug.h" +#include "nvme-lib.h" +#include "prio.h" +#include "util.h" +#include "structs.h" +#include "def_func.h" + +enum { + ANA_ERR_GETCTRL_FAILED = 1, + ANA_ERR_NOT_NVME, + ANA_ERR_NOT_SUPPORTED, + ANA_ERR_GETANAS_OVERFLOW, + ANA_ERR_GETANAS_NOTFOUND, + ANA_ERR_GETANALOG_FAILED, + ANA_ERR_GETNSID_FAILED, + ANA_ERR_GETNS_FAILED, + ANA_ERR_NO_MEMORY, + ANA_ERR_NO_INFORMATION, +}; + +static const char *ana_errmsg[] = { + [ANA_ERR_GETCTRL_FAILED] = "couldn't get ctrl info", + [ANA_ERR_NOT_NVME] = "not an NVMe device", + [ANA_ERR_NOT_SUPPORTED] = "ANA not supported", + [ANA_ERR_GETANAS_OVERFLOW] = "buffer overflow in ANA log", + [ANA_ERR_GETANAS_NOTFOUND] = "NSID or ANAGRPID not found", + [ANA_ERR_GETANALOG_FAILED] = "couldn't get ana log", + [ANA_ERR_GETNSID_FAILED] = "couldn't get NSID", + [ANA_ERR_GETNS_FAILED] = "couldn't get namespace info", + [ANA_ERR_NO_MEMORY] = "out of memory", + [ANA_ERR_NO_INFORMATION] = "invalid fd", +}; + +static const char *anas_string[] = { + [NVME_ANA_OPTIMIZED] = "ANA Optimized State", + [NVME_ANA_NONOPTIMIZED] = "ANA Non-Optimized State", + [NVME_ANA_INACCESSIBLE] = "ANA Inaccessible State", + [NVME_ANA_PERSISTENT_LOSS] = "ANA Persistent Loss State", + [NVME_ANA_CHANGE] = "ANA Change state", +}; + +static const char *aas_print_string(int rc) +{ + rc &= 0xff; + if (rc >= 0 && rc < ARRAY_SIZE(anas_string) && + anas_string[rc] != NULL) + return anas_string[rc]; + + return "invalid ANA state"; +} + +static int get_ana_state(__u32 nsid, __u32 anagrpid, void *ana_log, + size_t ana_log_len) +{ + void *base = ana_log; + struct nvme_ana_rsp_hdr *hdr = base; + struct nvme_ana_group_desc *ana_desc; + size_t offset = sizeof(struct nvme_ana_rsp_hdr); + __u32 nr_nsids; + size_t nsid_buf_size; + int i, j; + + for (i = 0; i < le16_to_cpu(hdr->ngrps); i++) { + ana_desc = base + offset; + + offset += sizeof(*ana_desc); + if (offset > ana_log_len) + return -ANA_ERR_GETANAS_OVERFLOW; + + nr_nsids = le32_to_cpu(ana_desc->nnsids); + nsid_buf_size = nr_nsids * sizeof(__le32); + + offset += nsid_buf_size; + if (offset > ana_log_len) + return -ANA_ERR_GETANAS_OVERFLOW; + + for (j = 0; j < nr_nsids; j++) { + if (nsid == le32_to_cpu(ana_desc->nsids[j])) + return ana_desc->state; + } + + if (anagrpid != 0 && anagrpid == le32_to_cpu(ana_desc->grpid)) + return ana_desc->state; + + } + return -ANA_ERR_GETANAS_NOTFOUND; +} + +int get_ana_info(struct path * pp, unsigned int timeout) +{ + int rc; + __u32 nsid; + struct nvme_id_ctrl ctrl; + struct nvme_id_ns ns; + void *ana_log; + size_t ana_log_len; + bool is_anagrpid_const; + + rc = nvme_id_ctrl_ana(pp->fd, &ctrl); + if (rc < 0) { + log_nvme_errcode(rc, pp->dev, "nvme_identify_ctrl"); + return -ANA_ERR_GETCTRL_FAILED; + } else if (rc == 0) + return -ANA_ERR_NOT_SUPPORTED; + + nsid = nvme_get_nsid(pp->fd); + if (nsid <= 0) { + log_nvme_errcode(rc, pp->dev, "nvme_get_nsid"); + return -ANA_ERR_GETNSID_FAILED; + } + is_anagrpid_const = ctrl.anacap & (1 << 6); + + /* + * Code copied from nvme-cli/nvme.c. We don't need to allocate an + * [nanagrpid*mnan] array of NSIDs because each NSID can occur at most + * in one ANA group. 
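+	 * Sizing sketch for the buffer allocated below: the log always
+	 * starts with the response header followed by NANAGRPID group
+	 * descriptors; only when ANAGRPID is not static (RGO is not used)
+	 * can up to MNAN 32-bit NSID entries follow, which is why
+	 * ana_log_len is enlarged in the else branch.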
+ */ + ana_log_len = sizeof(struct nvme_ana_rsp_hdr) + + le32_to_cpu(ctrl.nanagrpid) + * sizeof(struct nvme_ana_group_desc); + + if (is_anagrpid_const) { + rc = nvme_identify_ns(pp->fd, nsid, 0, &ns); + if (rc) { + log_nvme_errcode(rc, pp->dev, "nvme_identify_ns"); + return -ANA_ERR_GETNS_FAILED; + } + } else + ana_log_len += le32_to_cpu(ctrl.mnan) * sizeof(__le32); + + ana_log = malloc(ana_log_len); + if (!ana_log) + return -ANA_ERR_NO_MEMORY; + pthread_cleanup_push(free, ana_log); + rc = nvme_ana_log(pp->fd, ana_log, ana_log_len, + is_anagrpid_const ? NVME_ANA_LOG_RGO : 0); + if (rc) { + log_nvme_errcode(rc, pp->dev, "nvme_ana_log"); + rc = -ANA_ERR_GETANALOG_FAILED; + } else + rc = get_ana_state(nsid, + is_anagrpid_const ? + le32_to_cpu(ns.anagrpid) : 0, + ana_log, ana_log_len); + pthread_cleanup_pop(1); + if (rc >= 0) + condlog(3, "%s: ana state = %02x [%s]", pp->dev, rc, + aas_print_string(rc)); + return rc; +} + +/* + * Priorities modeled roughly after the ALUA model (alua.c/sysfs.c) + * Reference: ANA Base Protocol (NVMe TP 4004a, 11/13/2018). + * + * Differences: + * + * - The ANA base spec defines no implicit or explicit (STPG) state management. + * If a state is encountered that doesn't allow normal I/O (all except + * OPTIMIZED and NON_OPTIMIZED), we can't do anything but either wait for a + * Access State Change Notice (can't do that in multipathd as we don't receive + * those), or retry commands in regular time intervals until ANATT is expired + * (not implemented). Mapping UNAVAILABLE state to ALUA STANDBY is the best we + * can currently do. + * + * FIXME: Waiting for ANATT could be implemented with a "delayed failback" + * mechanism. The current "failback" method can't be used, as it would + * affect failback to every state, and here only failback to UNAVAILABLE + * should be delayed. + * + * - PERSISTENT_LOSS state is even below ALUA's UNAVAILABLE state. + * FIXME: According to the ANA TP, accessing paths in PERSISTENT_LOSS state + * in any way makes no sense (e.g. §8.19.6 - paths in this state shouldn't + * even be checked under "all paths down" conditions). Device mapper can, + * and will, select a PG for IO if it has non-failed paths, even if the + * PG has priority 0. We could avoid that only with an "ANA path checker". + * + * - ALUA has no CHANGE state. The ANA TP §8.18.3 / §8.19.4 suggests + * that CHANGE state should be treated in roughly the same way as + * INACCESSIBLE. Therefore we assign the same prio to it. + * + * - ALUA's LBA-dependent state has no ANA equivalent. 
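+ *
+ * For reference, the mapping applied by getprio() below is:
+ *   OPTIMIZED -> 50, NON_OPTIMIZED -> 10,
+ *   INACCESSIBLE / CHANGE -> 1, PERSISTENT_LOSS -> 0,
+ * and any error is reported as -1 (priority undefined).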
+ */ + +int getprio(struct path *pp, char *args) +{ + int rc; + + if (pp->fd < 0) + rc = -ANA_ERR_NO_INFORMATION; + else + rc = get_ana_info(pp, get_prio_timeout(60000)); + + switch (rc) { + case NVME_ANA_OPTIMIZED: + return 50; + case NVME_ANA_NONOPTIMIZED: + return 10; + case NVME_ANA_INACCESSIBLE: + case NVME_ANA_CHANGE: + return 1; + case NVME_ANA_PERSISTENT_LOSS: + return 0; + default: + break; + } + if (rc < 0 && -rc < ARRAY_SIZE(ana_errmsg)) + condlog(2, "%s: ANA error: %s", pp->dev, ana_errmsg[-rc]); + else + condlog(1, "%s: invalid ANA rc code %d", pp->dev, rc); + return -1; +} + +declare_nop_prio(initprio) +declare_nop_prio(freeprio) Index: multipath-tools-130222/libmultipath/util.h =================================================================== --- multipath-tools-130222.orig/libmultipath/util.h +++ multipath-tools-130222/libmultipath/util.h @@ -18,6 +18,8 @@ int parse_prkey(char *ptr, uint64_t *prk int parse_prkey_flags(char *ptr, uint64_t *prkey, uint8_t *flags); int safe_write(int fd, const void *buf, size_t count); +#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) + #define safe_sprintf(var, format, args...) \ snprintf(var, sizeof(var), format, ##args) >= sizeof(var) #define safe_snprintf(var, size, format, args...) \ Index: multipath-tools-130222/multipath/multipath.conf.5 =================================================================== --- multipath-tools-130222.orig/multipath/multipath.conf.5 +++ multipath-tools-130222/multipath/multipath.conf.5 @@ -196,6 +196,9 @@ Generate the path priority for LSI/Engen Generate the path priority for Compaq/HP controller in active/standby mode. .TP +.B ana +Generate the path priority based on the NVMe ANA settings. +.TP .B hds Generate the path priority for Hitachi HDS Modular storage arrays. .TP Index: multipath-tools-130222/libmultipath/propsel.c =================================================================== --- multipath-tools-130222.orig/libmultipath/propsel.c +++ multipath-tools-130222/libmultipath/propsel.c @@ -5,6 +5,7 @@ */ #include +#include "nvme-lib.h" #include "checkers.h" #include "memory.h" #include "vector.h" @@ -489,8 +490,13 @@ select_getuid (struct path * pp) void detect_prio(struct path * pp) { - if (detect_alua(pp)) - prio_get(&pp->prio, PRIO_ALUA, DEFAULT_PRIO_ARGS); + if (pp->bus == SYSFS_BUS_NVME) { + if (nvme_id_ctrl_ana(pp->fd, NULL) == 1) + prio_get(&pp->prio, PRIO_ANA, DEFAULT_PRIO_ARGS); + } else if (pp->bus == SYSFS_BUS_SCSI) { + if (detect_alua(pp)) + prio_get(&pp->prio, PRIO_ALUA, DEFAULT_PRIO_ARGS); + } } extern int Index: multipath-tools-130222/libmultipath/hwtable.c =================================================================== --- multipath-tools-130222.orig/libmultipath/hwtable.c +++ multipath-tools-130222/libmultipath/hwtable.c @@ -1178,6 +1178,7 @@ static struct hwentry default_hw[] = { .vendor = "NVME", .product = ".*", .uid_attribute = "ID_WWN", + .detect_prio = DETECT_PRIO_ON, .checker_name = NONE, }, /*