diff --git a/Jenkinsfile b/Jenkinsfile index f9ce324a937..1c88534f985 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -277,6 +277,9 @@ pipeline { booleanParam(name: 'CI_medium_md_on_ssd_TEST', defaultValue: false, description: 'Run the Functional Hardware Medium MD on SSD test stage') + booleanParam(name: 'CI_medium_vmd_TEST', + defaultValue: true, + description: 'Run the Functional Hardware Medium VMD test stage') booleanParam(name: 'CI_medium_verbs_provider_TEST', defaultValue: false, description: 'Run the Functional Hardware Medium Verbs Provider test stage') @@ -310,6 +313,9 @@ pipeline { string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL', defaultValue: 'ci_nvme5', description: 'Label to use for 5 node Functional Hardware Medium Verbs Provider (MD on SSD) stages') + string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_VMD_LABEL', + defaultValue: 'ci_vmd5', + description: 'Label to use for the Functional Hardware Medium VMD stage') string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_UCX_PROVIDER_LABEL', defaultValue: 'ci_ofed5', description: 'Label to use for 5 node Functional Hardware Medium UCX Provider stage') @@ -1183,6 +1189,19 @@ pipeline { run_if_landing: false, job_status: job_status_internal ), + 'Functional Hardware Medium VMD': getFunctionalTestStage( + name: 'Functional Hardware Medium VMD', + pragma_suffix: '-hw-medium-vmd', + label: params.FUNCTIONAL_HARDWARE_MEDIUM_VMD_LABEL, + next_version: next_version, + stage_tags: 'hw_vmd,medium', + /* groovylint-disable-next-line UnnecessaryGetter */ + default_tags: startedByTimer() ? 'pr daily_regression' : 'pr', + nvme: 'auto', + run_if_pr: false, + run_if_landing: false, + job_status: job_status_internal + ), 'Functional Hardware Medium Verbs Provider': getFunctionalTestStage( name: 'Functional Hardware Medium Verbs Provider', pragma_suffix: '-hw-medium-verbs-provider', diff --git a/TAG b/TAG index aefa26665a7..47d92ef13d9 100644 --- a/TAG +++ b/TAG @@ -1 +1 @@ -2.6.1-rc2 +2.6.1-rc3 diff --git a/debian/changelog b/debian/changelog index b9edca31485..6891cea4737 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +daos (2.6.1-3) unstable; urgency=medium + [ Phillip Henderson ] + * Third release candidate for 2.6.1 + + -- Phillip Henderson Tue, 01 Oct 2024 14:23:00 -0500 + daos (2.6.1-2) unstable; urgency=medium [ Phillip Henderson ] * Second release candidate for 2.6.1 diff --git a/docs/admin/administration.md b/docs/admin/administration.md index 1aeb7140305..1037fc6144c 100644 --- a/docs/admin/administration.md +++ b/docs/admin/administration.md @@ -620,21 +620,17 @@ Usage: [nvme-faulty command options] -u, --uuid= Device UUID to set -f, --force Do not require confirmation + -l, --host= Single host address to connect to ``` To manually evict an NVMe SSD (auto eviction is covered later in this section), the device state needs to be set faulty by running the following command: ```bash -$ dmg -l boro-11 storage set nvme-faulty --uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 +$ dmg storage set nvme-faulty --host=boro-11 --uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 NOTICE: This command will permanently mark the device as unusable! Are you sure you want to continue? 
(yes/no) yes
--------
-boro-11
--------
-  Devices
-    UUID:5bd91603-d3c7-4fb7-9a71-76bc25690c19 [TrAddr:]
-      Targets:[] Rank:0 State:EVICTED LED:ON
+set-faulty operation performed successfully on the following host: boro-11:10001
```
The device state will transition from "NORMAL" to "EVICTED" (shown above), during which time the
faulty device reaction will have been triggered (all targets on the SSD will be rebuilt).
@@ -693,19 +689,14 @@ Usage:
   [nvme command options]
           --old-uuid= Device UUID of hot-removed SSD
           --new-uuid= Device UUID of new device
-          --no-reint  Bypass reintegration of device and just bring back online.
+      -l, --host=     Single host address to connect to
```
To replace an NVMe SSD with an evicted device and reintegrate it into use with DAOS,
run the following command:
```bash
-$ dmg -l boro-11 storage replace nvme --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=80c9f1be-84b9-4318-a1be-c416c96ca48b
--------
-boro-11
--------
-  Devices
-    UUID:80c9f1be-84b9-4318-a1be-c416c96ca48b [TrAddr:]
-      Targets:[] Rank:1 State:NORMAL LED:OFF
+$ dmg storage replace nvme --host=boro-11 --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=80c9f1be-84b9-4318-a1be-c416c96ca48b
+dev-replace operation performed successfully on the following host: boro-11:10001
```
The old, now replaced device will remain in an "EVICTED" state until it is unplugged.
The new device will transition from a "NEW" state to a "NORMAL" state (shown above).
@@ -716,14 +707,9 @@ In order to reuse a device that was previously set as FAULTY and evicted from the
system, an admin can run the following command (setting the old device UUID to be the
new device UUID):
```bash
-$ dmg -l boro-11 storage replace nvme --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
+$ dmg storage replace nvme --host=boro-11 --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
 NOTICE: Attempting to reuse a previously set FAULTY device!
--------
-boro-11
--------
-  Devices
-    UUID:5bd91603-d3c7-4fb7-9a71-76bc25690c19 [TrAddr:]
-      Targets:[] Rank:1 State:NORMAL LED:OFF
+dev-replace operation performed successfully on the following host: boro-11:10001
```
The FAULTY device will transition from an "EVICTED" state back to a "NORMAL" state,
and will again be available for use with DAOS. The use case of this command will mainly
diff --git a/docs/release/release_notes.md b/docs/release/release_notes.md
index aab836cdeef..2f95f111120 100644
--- a/docs/release/release_notes.md
+++ b/docs/release/release_notes.md
@@ -2,6 +2,57 @@
 We are pleased to announce the release of DAOS version 2.6.

+## DAOS Version 2.6.1 (2024-10-05)
+
+The DAOS 2.6.1 release contains the following updates on top of DAOS 2.6.0:
+
+* Mercury update for the Slingshot 11.0 host stack and other UCX provider fixes.
+
+### Bug fixes and improvements
+
+The DAOS 2.6.1 release includes fixes for several defects and a few
+administrator interface changes that improve the usability of a DAOS system.
+
+* Fix a race between an MS replica stepping up as leader and engines joining
+  the system; this race could cause engine joins to fail.
+
+* Fix a race in concurrent container destroy that could cause an engine crash.
+
+* Pool destroy now returns an explicit error instead of success if a destroy
+  of the same pool is already in progress.
+
+* Fix an EC aggregation issue that could cause inconsistency between data
+  shards and parity shards.
+
+* Enable pool list for clients; see the brief example below.
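+
+  As a minimal illustration (assuming a client node with a configured DAOS
+  agent; output omitted here and may vary by release), pools can now be
+  listed with the client tool:
+
+  ```bash
+  $ daos pool list
+  ```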
+
+* Running "daos|dmg pool query-targets" with a rank argument queries all
+  targets on that rank.
+
+* Add a "daos health check" command that allows basic system health checks
+  from the client.
+
+* DAOS Version 2.6.0 always excludes unreachable engines reported by SWIM and
+  schedules rebuild for the excluded engines. This is an overreaction when many
+  engines are impacted by a power failure or switch reboot, because data
+  recovery is impossible in those cases. DAOS 2.6.1 introduces a new
+  environment variable to set in the server yaml file for each engine
+  (DAOS_POOL_RF) to indicate the number of engine failures tolerated before
+  the system stops changing pool membership and completing in-progress
+  rebuild; instead, all I/O and ongoing rebuild simply block. The DAOS system
+  can then finish the in-progress rebuild and become available again after the
+  impacted engines are brought back. The recommendation is to set this
+  environment variable to 2.
+
+* In DAOS Version 2.6.0, accessing a faulty NVMe device returned the wrong
+  error code to the DAOS client, which could fail the application. DAOS 2.6.1
+  returns the correct error code to the DAOS client so the client can retry
+  and eventually access the data in degraded mode instead of failing the I/O.
+
+* Pil4dfs fixes to avoid a deadlock with the Level Zero library on Aurora, and
+  support for more libc functions that were not previously intercepted.
+
+For details, please refer to the GitHub
+[release/2.6 commit history](https://github.com/daos-stack/daos/commits/release/2.6)
+and the associated [Jira tickets](https://jira.daos.io/) as stated in the commit messages.
+
+
 ## DAOS Version 2.6.0 (2024-07-26)

 ### General Support
diff --git a/src/bio/README.md b/src/bio/README.md
index 1dcdae11c97..da83d995e91 100644
--- a/src/bio/README.md
+++ b/src/bio/README.md
@@ -209,7 +209,7 @@ Devices:
 - Manually Set Device State to FAULTY: **$dmg storage set nvme-faulty**
 ```
-$ dmg storage set nvme-faulty --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
+$ dmg storage set nvme-faulty --host=localhost --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
 Devices
         UUID:9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 [TrAddr:0000:8d:00.0]
             Targets:[0] Rank:0 State:EVICTED
@@ -219,7 +219,7 @@ Devices
 - Replace an evicted device with a new device: **$dmg storage replace nvme**
 ```
-$ dmg storage replace nvme --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=8131fc39-4b1c-4662-bea1-734e728c434e
+$ dmg storage replace nvme --host=localhost --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=8131fc39-4b1c-4662-bea1-734e728c434e
 Devices
         UUID:8131fc39-4b1c-4662-bea1-734e728c434e [TrAddr:0000:8d:00.0]
             Targets:[0] Rank:0 State:NORMAL
@@ -229,7 +229,7 @@ Devices
 - Reuse a previously evicted device: **$dmg storage replace nvme**
 ```
-$ dmg storage replace nvme --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
+$ dmg storage replace nvme --host=localhost --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
 Devices
         UUID:9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 [TrAddr:0000:8a:00.0]
             Targets:[0] Rank:0 State:NORMAL
diff --git a/src/bio/smd.pb-c.c b/src/bio/smd.pb-c.c
index 720c0caa029..0be530f7e27 100644
--- a/src/bio/smd.pb-c.c
+++ b/src/bio/smd.pb-c.c
@@ -2120,69 +2120,39 @@ const ProtobufCMessageDescriptor ctl__led_manage_req__descriptor =
   (ProtobufCMessageInit) ctl__led_manage_req__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor ctl__dev_replace_req__field_descriptors[3] =
-{
-  {
-    "old_dev_uuid",
-    1,
-    PROTOBUF_C_LABEL_NONE,
-
PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(Ctl__DevReplaceReq, old_dev_uuid), - NULL, - &protobuf_c_empty_string, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "new_dev_uuid", - 2, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(Ctl__DevReplaceReq, new_dev_uuid), - NULL, - &protobuf_c_empty_string, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "no_reint", - 3, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_BOOL, - 0, /* quantifier_offset */ - offsetof(Ctl__DevReplaceReq, no_reint), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor ctl__dev_replace_req__field_descriptors[2] = { + { + "old_dev_uuid", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ + offsetof(Ctl__DevReplaceReq, old_dev_uuid), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "new_dev_uuid", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ + offsetof(Ctl__DevReplaceReq, new_dev_uuid), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned ctl__dev_replace_req__field_indices_by_name[] = { - 1, /* field[1] = new_dev_uuid */ - 2, /* field[2] = no_reint */ - 0, /* field[0] = old_dev_uuid */ -}; -static const ProtobufCIntRange ctl__dev_replace_req__number_ranges[1 + 1] = -{ - { 1, 0 }, - { 0, 3 } + 1, /* field[1] = new_dev_uuid */ + 0, /* field[0] = old_dev_uuid */ }; -const ProtobufCMessageDescriptor ctl__dev_replace_req__descriptor = -{ - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "ctl.DevReplaceReq", - "DevReplaceReq", - "Ctl__DevReplaceReq", - "ctl", - sizeof(Ctl__DevReplaceReq), - 3, - ctl__dev_replace_req__field_descriptors, - ctl__dev_replace_req__field_indices_by_name, - 1, ctl__dev_replace_req__number_ranges, - (ProtobufCMessageInit) ctl__dev_replace_req__init, - NULL,NULL,NULL /* reserved[123] */ +static const ProtobufCIntRange ctl__dev_replace_req__number_ranges[1 + 1] = {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor ctl__dev_replace_req__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "ctl.DevReplaceReq", + "DevReplaceReq", + "Ctl__DevReplaceReq", + "ctl", + sizeof(Ctl__DevReplaceReq), + 2, + ctl__dev_replace_req__field_descriptors, + ctl__dev_replace_req__field_indices_by_name, + 1, + ctl__dev_replace_req__number_ranges, + (ProtobufCMessageInit)ctl__dev_replace_req__init, + NULL, + NULL, + NULL /* reserved[123] */ }; static const ProtobufCFieldDescriptor ctl__set_faulty_req__field_descriptors[1] = { diff --git a/src/bio/smd.pb-c.h b/src/bio/smd.pb-c.h index 7d197849f0e..06a15042abc 100644 --- a/src/bio/smd.pb-c.h +++ b/src/bio/smd.pb-c.h @@ -606,16 +606,13 @@ struct _Ctl__DevReplaceReq /* * UUID of new (hot-plugged) blobstore/device */ - char *new_dev_uuid; - /* - * Skip device reintegration if set - */ - protobuf_c_boolean no_reint; + char *new_dev_uuid; }; -#define CTL__DEV_REPLACE_REQ__INIT \ - { PROTOBUF_C_MESSAGE_INIT (&ctl__dev_replace_req__descriptor) \ - , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0 } - +#define CTL__DEV_REPLACE_REQ__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT(&ctl__dev_replace_req__descriptor) \ + , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string \ + } struct _Ctl__SetFaultyReq { diff --git a/src/client/dfuse/il/int_posix.c b/src/client/dfuse/il/int_posix.c index 
7b281f82807..f29f5cee2ac 100644 --- a/src/client/dfuse/il/int_posix.c +++ b/src/client/dfuse/il/int_posix.c @@ -812,6 +812,7 @@ child_hdlr(void) DFUSE_LOG_WARNING("daos_eq_create() failed: "DF_RC, DP_RC(rc)); else ioil_iog.iog_main_eqh = ioil_eqh; + ioil_iog.iog_eq_count = 0; } /* Returns true on success */ diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index 6b068fbce72..fff49d9f0ae 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -945,6 +945,7 @@ child_hdlr(void) daos_dti_reset(); td_eqh = main_eqh = DAOS_HDL_INVAL; context_reset = true; + d_eq_count = 0; } /* only free the reserved low fds when application exits or encounters error */ diff --git a/src/client/java/hadoop-daos/pom.xml b/src/client/java/hadoop-daos/pom.xml index 7f8dac9f9f6..1bba3428580 100644 --- a/src/client/java/hadoop-daos/pom.xml +++ b/src/client/java/hadoop-daos/pom.xml @@ -15,7 +15,7 @@ jar - 3.3.6 + 3.4.0 ${project.basedir}/build ${project.basedir}/install diff --git a/src/common/tests_dmg_helpers.c b/src/common/tests_dmg_helpers.c index d4e569b7c95..f894b9f861d 100644 --- a/src/common/tests_dmg_helpers.c +++ b/src/common/tests_dmg_helpers.c @@ -1393,7 +1393,7 @@ dmg_storage_set_nvme_fault(const char *dmg_config_file, D_GOTO(out, rc = -DER_NOMEM); } - args = cmd_push_arg(args, &argcount, " --host-list=%s ", host); + args = cmd_push_arg(args, &argcount, " --host=%s ", host); if (args == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/control/cmd/dmg/json_test.go b/src/control/cmd/dmg/json_test.go index 8187f5b1078..9021f1c8d5d 100644 --- a/src/control/cmd/dmg/json_test.go +++ b/src/control/cmd/dmg/json_test.go @@ -76,10 +76,11 @@ func TestDmg_JsonOutput(t *testing.T) { testArgs = append(testArgs, "-l", "foo.com", "-a", test.MockPCIAddr(), "-e", "0") case "storage set nvme-faulty": - testArgs = append(testArgs, "--force", "-u", test.MockUUID()) + testArgs = append(testArgs, "--host", "foo.com", "--force", "-u", + test.MockUUID()) case "storage replace nvme": - testArgs = append(testArgs, "--old-uuid", test.MockUUID(), - "--new-uuid", test.MockUUID()) + testArgs = append(testArgs, "--host", "foo.com", "--old-uuid", + test.MockUUID(), "--new-uuid", test.MockUUID()) case "storage led identify", "storage led check", "storage led clear": testArgs = append(testArgs, test.MockUUID()) case "pool create": diff --git a/src/control/cmd/dmg/main.go b/src/control/cmd/dmg/main.go index fc6355e76f8..c88845a304c 100644 --- a/src/control/cmd/dmg/main.go +++ b/src/control/cmd/dmg/main.go @@ -41,8 +41,7 @@ type ( } singleHostCmd struct { - HostList singleHostFlag `short:"l" long:"host-list" default:"localhost" description:"Single host address to connect to"` - host string + Host singleHostFlag `short:"l" long:"host" required:"1" description:"Single host address to connect to"` } ctlInvoker interface { @@ -52,6 +51,25 @@ type ( ctlInvokerCmd struct { ctlInvoker control.Invoker } + + cmdLogger interface { + setLog(*logging.LeveledLogger) + } + + // cmdConfigSetter is an interface for setting the control config on a command + cmdConfigSetter interface { + setConfig(*control.Config) + } + + // cfgCmd is a structure that can be used by commands that need the control config. 
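+	// The config is injected via the setConfig method implemented below.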
+ cfgCmd struct { + config *control.Config + } + + baseCmd struct { + cmdutil.NoArgsCmd + cmdutil.LogCmd + } ) func (cmd *ctlInvokerCmd) setInvoker(c control.Invoker) { @@ -69,43 +87,8 @@ func (cmd *hostListCmd) setHostList(newList *hostlist.HostSet) { cmd.HostList.Replace(newList) } -func (cmd *singleHostCmd) getHostList() []string { - if cmd.host == "" { - if cmd.HostList.Count() == 0 { - cmd.host = "localhost" - } else { - cmd.host = cmd.HostList.Slice()[0] - } - } - return []string{cmd.host} -} - -func (cmd *singleHostCmd) setHostList(newList *hostlist.HostSet) { - cmd.HostList.Replace(newList) -} - -type cmdLogger interface { - setLog(*logging.LeveledLogger) -} - -type baseCmd struct { - cmdutil.NoArgsCmd - cmdutil.LogCmd -} - -// cmdConfigSetter is an interface for setting the control config on a command -type cmdConfigSetter interface { - setConfig(*control.Config) -} - -// cfgCmd is a structure that can be used by commands that need the control -// config. -type cfgCmd struct { - config *control.Config -} - -func (c *cfgCmd) setConfig(cfg *control.Config) { - c.config = cfg +func (cmd *cfgCmd) setConfig(cfg *control.Config) { + cmd.config = cfg } type cliOptions struct { diff --git a/src/control/cmd/dmg/storage_query.go b/src/control/cmd/dmg/storage_query.go index 2075350cebe..8604bb81100 100644 --- a/src/control/cmd/dmg/storage_query.go +++ b/src/control/cmd/dmg/storage_query.go @@ -40,11 +40,15 @@ type smdQueryCmd struct { func (cmd *smdQueryCmd) makeRequest(ctx context.Context, req *control.SmdQueryReq, opts ...pretty.PrintConfigOption) error { req.SetHostList(cmd.getHostList()) + cmd.Tracef("smd query request: %+v", req) + resp, err := control.SmdQuery(ctx, cmd.ctlInvoker, req) if err != nil { return err // control api returned an error, disregard response } + cmd.Tracef("smd query response: %+v", resp) + if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, resp.Errors()) } @@ -155,13 +159,10 @@ func (cmd *usageQueryCmd) Execute(_ []string) error { type smdManageCmd struct { baseCmd ctlInvokerCmd - hostListCmd cmdutil.JSONOutputCmd } func (cmd *smdManageCmd) makeRequest(ctx context.Context, req *control.SmdManageReq, opts ...pretty.PrintConfigOption) error { - req.SetHostList(cmd.getHostList()) - cmd.Tracef("smd manage request: %+v", req) resp, err := control.SmdManage(ctx, cmd.ctlInvoker, req) @@ -169,7 +170,7 @@ func (cmd *smdManageCmd) makeRequest(ctx context.Context, req *control.SmdManage return err // control api returned an error, disregard response } - cmd.Tracef("smd managee response: %+v", resp) + cmd.Tracef("smd manage response: %+v", resp) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, resp.Errors()) @@ -195,6 +196,7 @@ type setFaultyCmd struct { type nvmeSetFaultyCmd struct { smdManageCmd + singleHostCmd UUID string `short:"u" long:"uuid" description:"Device UUID to set" required:"1"` Force bool `short:"f" long:"force" description:"Do not require confirmation"` } @@ -213,6 +215,7 @@ func (cmd *nvmeSetFaultyCmd) Execute(_ []string) error { Operation: control.SetFaultyOp, IDs: cmd.UUID, } + req.SetHostList(cmd.Host.Slice()) return cmd.makeRequest(cmd.MustLogCtx(), req) } @@ -224,9 +227,9 @@ type storageReplaceCmd struct { // nvmeReplaceCmd is the struct representing the replace nvme storage subcommand type nvmeReplaceCmd struct { smdManageCmd + singleHostCmd OldDevUUID string `long:"old-uuid" description:"Device UUID of hot-removed SSD" required:"1"` NewDevUUID string `long:"new-uuid" description:"Device UUID of new device" required:"1"` - NoReint bool 
`long:"no-reint" description:"Bypass reintegration of device and just bring back online."` } // Execute is run when storageReplaceCmd activates @@ -236,23 +239,18 @@ func (cmd *nvmeReplaceCmd) Execute(_ []string) error { cmd.Notice("Attempting to reuse a previously set FAULTY device!") } - // TODO: Implement no-reint flag option - if cmd.NoReint { - return errors.New("NoReint is not currently implemented") - } - req := &control.SmdManageReq{ - Operation: control.DevReplaceOp, - IDs: cmd.OldDevUUID, - ReplaceUUID: cmd.NewDevUUID, - ReplaceNoReint: cmd.NoReint, + Operation: control.DevReplaceOp, + IDs: cmd.OldDevUUID, + ReplaceUUID: cmd.NewDevUUID, } + req.SetHostList(cmd.Host.Slice()) return cmd.makeRequest(cmd.MustLogCtx(), req) } type ledCmd struct { smdManageCmd - + hostListCmd Args struct { IDs string `positional-arg-name:"ids" description:"Comma-separated list of identifiers which could be either VMD backing device (NVMe SSD) PCI addresses or device UUIDs. All SSDs selected if arg not provided."` } `positional-args:"yes"` @@ -287,6 +285,7 @@ func (cmd *ledIdentifyCmd) Execute(_ []string) error { } req.Operation = control.LedResetOp } + req.SetHostList(cmd.getHostList()) return cmd.makeRequest(cmd.MustLogCtx(), req, pretty.PrintOnlyLEDInfo()) } @@ -305,5 +304,6 @@ func (cmd *ledCheckCmd) Execute(_ []string) error { Operation: control.LedCheckOp, IDs: cmd.Args.IDs, } + req.SetHostList(cmd.getHostList()) return cmd.makeRequest(cmd.MustLogCtx(), req, pretty.PrintOnlyLEDInfo()) } diff --git a/src/control/cmd/dmg/storage_query_test.go b/src/control/cmd/dmg/storage_query_test.go index d07685490ff..190ddaee217 100644 --- a/src/control/cmd/dmg/storage_query_test.go +++ b/src/control/cmd/dmg/storage_query_test.go @@ -122,64 +122,80 @@ func TestStorageQueryCommands(t *testing.T) { nil, }, { - "Set FAULTY device status (force)", + "Set FAULTY device status (missing host)", "storage set nvme-faulty --uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d -f", - printRequest(t, &control.SmdManageReq{ - Operation: control.SetFaultyOp, - IDs: "842c739b-86b5-462f-a7ba-b4a91b674f3d", - }), - nil, - }, - { - "Set FAULTY device status (without force)", - "storage set nvme-faulty --uuid abcd", - "StorageSetFaulty", - errors.New("consent not given"), + "", + errors.New("not specified"), }, { "Set FAULTY device status (with > 1 host)", "storage set nvme-faulty -l host-[1-2] -f --uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d", - "StorageSetFaulty", - errors.New("> 1 host"), + "", + errors.New("must specify a single host"), }, { - "Set FAULTY device status (with > 1 host) with legacy hostlist", - "-l host-[1-2] storage set nvme-faulty -f --uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d", - "StorageSetFaulty", - errors.New("> 1 host"), + "Set FAULTY device status (force)", + "storage set nvme-faulty --host foo --uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d -f", + printRequest(t, func() *control.SmdManageReq { + req := &control.SmdManageReq{ + Operation: control.SetFaultyOp, + IDs: "842c739b-86b5-462f-a7ba-b4a91b674f3d", + } + req.SetHostList([]string{"foo"}) + return req + }()), + nil, + }, + { + "Set FAULTY device status (without force)", + "storage set nvme-faulty --host foo --uuid abcd", + "", + errors.New("consent not given"), }, { "Set FAULTY device status without device specified", - "storage set nvme-faulty", - "StorageSetFaulty", + "storage set nvme-faulty --host foo", + "", errors.New("the required flag `-u, --uuid' was not specified"), }, { - "Reuse a FAULTY device", + "Reuse a FAULTY device (missing host)", "storage replace 
nvme --old-uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d --new-uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d", - printRequest(t, &control.SmdManageReq{ - Operation: control.DevReplaceOp, - IDs: "842c739b-86b5-462f-a7ba-b4a91b674f3d", - ReplaceUUID: "842c739b-86b5-462f-a7ba-b4a91b674f3d", - ReplaceNoReint: false, - }), + "", + errors.New("not specified"), + }, + { + "Reuse a FAULTY device", + "storage replace nvme --host foo --old-uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d --new-uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d", + printRequest(t, func() *control.SmdManageReq { + req := &control.SmdManageReq{ + Operation: control.DevReplaceOp, + IDs: "842c739b-86b5-462f-a7ba-b4a91b674f3d", + ReplaceUUID: "842c739b-86b5-462f-a7ba-b4a91b674f3d", + } + req.SetHostList([]string{"foo"}) + return req + }()), nil, }, { "Replace an evicted device with a new device", - "storage replace nvme --old-uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d --new-uuid 2ccb8afb-5d32-454e-86e3-762ec5dca7be", - printRequest(t, &control.SmdManageReq{ - Operation: control.DevReplaceOp, - IDs: "842c739b-86b5-462f-a7ba-b4a91b674f3d", - ReplaceUUID: "2ccb8afb-5d32-454e-86e3-762ec5dca7be", - ReplaceNoReint: false, - }), + "storage replace nvme --host foo --old-uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d --new-uuid 2ccb8afb-5d32-454e-86e3-762ec5dca7be", + printRequest(t, func() *control.SmdManageReq { + req := &control.SmdManageReq{ + Operation: control.DevReplaceOp, + IDs: "842c739b-86b5-462f-a7ba-b4a91b674f3d", + ReplaceUUID: "2ccb8afb-5d32-454e-86e3-762ec5dca7be", + } + req.SetHostList([]string{"foo"}) + return req + }()), nil, }, { "Try to replace a device without a new device UUID specified", - "storage replace nvme --old-uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d", - "StorageReplaceNvme", + "storage replace nvme -l foo --old-uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d", + "", errors.New("the required flag `--new-uuid' was not specified"), }, { diff --git a/src/control/cmd/dmg/telemetry.go b/src/control/cmd/dmg/telemetry.go index fd63164b438..40b84d1512f 100644 --- a/src/control/cmd/dmg/telemetry.go +++ b/src/control/cmd/dmg/telemetry.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -310,7 +310,7 @@ type metricsListCmd struct { // Execute runs the command to list metrics from the DAOS storage nodes. func (cmd *metricsListCmd) Execute(args []string) error { - host, err := getMetricsHost(cmd.getHostList()) + host, err := getMetricsHost(cmd.Host.Slice()) if err != nil { return err } @@ -365,7 +365,7 @@ type metricsQueryCmd struct { // Execute runs the command to query metrics from the DAOS storage nodes. func (cmd *metricsQueryCmd) Execute(args []string) error { - host, err := getMetricsHost(cmd.getHostList()) + host, err := getMetricsHost(cmd.Host.Slice()) if err != nil { return err } diff --git a/src/control/cmd/dmg/utils.go b/src/control/cmd/dmg/utils.go index 683b5e975ad..b8b97e43ff8 100644 --- a/src/control/cmd/dmg/utils.go +++ b/src/control/cmd/dmg/utils.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2018-2022 Intel Corporation. +// (C) Copyright 2018-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -19,6 +19,7 @@ import ( type singleHostFlag ui.HostSetFlag +// UnmarshalFlag implements the go-flags.Unmarshaler interface. 
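+// A value that expands to more than one host is rejected (the commands report
+// "must specify a single host").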
func (shf *singleHostFlag) UnmarshalFlag(value string) error { if err := (*ui.HostSetFlag)(shf).UnmarshalFlag(value); err != nil { return err diff --git a/src/control/common/proto/ctl/smd.pb.go b/src/control/common/proto/ctl/smd.pb.go index 089c35cc7d4..66dedd0c4cc 100644 --- a/src/control/common/proto/ctl/smd.pb.go +++ b/src/control/common/proto/ctl/smd.pb.go @@ -1390,7 +1390,6 @@ type DevReplaceReq struct { OldDevUuid string `protobuf:"bytes,1,opt,name=old_dev_uuid,json=oldDevUuid,proto3" json:"old_dev_uuid,omitempty"` // UUID of old (hot-removed) blobstore/device NewDevUuid string `protobuf:"bytes,2,opt,name=new_dev_uuid,json=newDevUuid,proto3" json:"new_dev_uuid,omitempty"` // UUID of new (hot-plugged) blobstore/device - NoReint bool `protobuf:"varint,3,opt,name=no_reint,json=noReint,proto3" json:"no_reint,omitempty"` // Skip device reintegration if set } func (x *DevReplaceReq) Reset() { @@ -1439,13 +1438,6 @@ func (x *DevReplaceReq) GetNewDevUuid() string { return "" } -func (x *DevReplaceReq) GetNoReint() bool { - if x != nil { - return x.NoReint - } - return false -} - type SetFaultyReq struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -2314,61 +2306,60 @@ var file_ctl_smd_proto_rawDesc = []byte{ 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x08, 0x6c, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x2a, 0x0a, 0x11, 0x6c, 0x65, 0x64, 0x5f, 0x64, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x6d, 0x69, 0x6e, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0f, 0x6c, 0x65, 0x64, 0x44, - 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4d, 0x69, 0x6e, 0x73, 0x22, 0x6e, 0x0a, 0x0d, 0x44, + 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4d, 0x69, 0x6e, 0x73, 0x22, 0x53, 0x0a, 0x0d, 0x44, 0x65, 0x76, 0x52, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x52, 0x65, 0x71, 0x12, 0x20, 0x0a, 0x0c, 0x6f, 0x6c, 0x64, 0x5f, 0x64, 0x65, 0x76, 0x5f, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x6f, 0x6c, 0x64, 0x44, 0x65, 0x76, 0x55, 0x75, 0x69, 0x64, 0x12, 0x20, 0x0a, 0x0c, 0x6e, 0x65, 0x77, 0x5f, 0x64, 0x65, 0x76, 0x5f, 0x75, 0x75, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x6e, 0x65, 0x77, 0x44, 0x65, 0x76, 0x55, 0x75, 0x69, 0x64, - 0x12, 0x19, 0x0a, 0x08, 0x6e, 0x6f, 0x5f, 0x72, 0x65, 0x69, 0x6e, 0x74, 0x18, 0x03, 0x20, 0x01, - 0x28, 0x08, 0x52, 0x07, 0x6e, 0x6f, 0x52, 0x65, 0x69, 0x6e, 0x74, 0x22, 0x22, 0x0a, 0x0c, 0x53, - 0x65, 0x74, 0x46, 0x61, 0x75, 0x6c, 0x74, 0x79, 0x52, 0x65, 0x71, 0x12, 0x12, 0x0a, 0x04, 0x75, - 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x22, - 0x4f, 0x0a, 0x0d, 0x44, 0x65, 0x76, 0x4d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x52, 0x65, 0x73, 0x70, - 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, - 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x26, 0x0a, 0x06, 0x64, 0x65, 0x76, 0x69, - 0x63, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0e, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x53, - 0x6d, 0x64, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x52, 0x06, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, - 0x22, 0x98, 0x01, 0x0a, 0x0c, 0x53, 0x6d, 0x64, 0x4d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x52, 0x65, - 0x71, 0x12, 0x25, 0x0a, 0x03, 0x6c, 0x65, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x11, - 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4c, 0x65, 0x64, 0x4d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x52, 0x65, - 0x71, 0x48, 0x00, 0x52, 0x03, 0x6c, 0x65, 0x64, 0x12, 0x2e, 0x0a, 0x07, 0x72, 0x65, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 
0x2e, 0x63, 0x74, 0x6c, 0x2e, - 0x44, 0x65, 0x76, 0x52, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x52, 0x65, 0x71, 0x48, 0x00, 0x52, - 0x07, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x12, 0x2b, 0x0a, 0x06, 0x66, 0x61, 0x75, 0x6c, - 0x74, 0x79, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x11, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x53, - 0x65, 0x74, 0x46, 0x61, 0x75, 0x6c, 0x74, 0x79, 0x52, 0x65, 0x71, 0x48, 0x00, 0x52, 0x06, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x79, 0x42, 0x04, 0x0a, 0x02, 0x6f, 0x70, 0x22, 0xe1, 0x01, 0x0a, 0x0d, - 0x53, 0x6d, 0x64, 0x4d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x31, 0x0a, - 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x63, - 0x74, 0x6c, 0x2e, 0x53, 0x6d, 0x64, 0x4d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x52, 0x65, 0x73, 0x70, - 0x2e, 0x52, 0x61, 0x6e, 0x6b, 0x52, 0x65, 0x73, 0x70, 0x52, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, - 0x1a, 0x48, 0x0a, 0x06, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, - 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, - 0x75, 0x73, 0x12, 0x26, 0x0a, 0x06, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x18, 0x02, 0x20, 0x01, - 0x28, 0x0b, 0x32, 0x0e, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x53, 0x6d, 0x64, 0x44, 0x65, 0x76, 0x69, - 0x63, 0x65, 0x52, 0x06, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x1a, 0x53, 0x0a, 0x08, 0x52, 0x61, - 0x6e, 0x6b, 0x52, 0x65, 0x73, 0x70, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x01, - 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x33, 0x0a, 0x07, 0x72, 0x65, - 0x73, 0x75, 0x6c, 0x74, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x63, 0x74, - 0x6c, 0x2e, 0x53, 0x6d, 0x64, 0x4d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x52, 0x65, 0x73, 0x70, 0x2e, - 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x07, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x73, 0x2a, - 0x4c, 0x0a, 0x0c, 0x4e, 0x76, 0x6d, 0x65, 0x44, 0x65, 0x76, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, - 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0a, 0x0a, 0x06, - 0x4e, 0x4f, 0x52, 0x4d, 0x41, 0x4c, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x4e, 0x45, 0x57, 0x10, - 0x02, 0x12, 0x0b, 0x0a, 0x07, 0x45, 0x56, 0x49, 0x43, 0x54, 0x45, 0x44, 0x10, 0x03, 0x12, 0x0d, - 0x0a, 0x09, 0x55, 0x4e, 0x50, 0x4c, 0x55, 0x47, 0x47, 0x45, 0x44, 0x10, 0x04, 0x2a, 0x44, 0x0a, - 0x08, 0x4c, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x06, 0x0a, 0x02, 0x4e, 0x41, 0x10, - 0x00, 0x12, 0x0f, 0x0a, 0x0b, 0x51, 0x55, 0x49, 0x43, 0x4b, 0x5f, 0x42, 0x4c, 0x49, 0x4e, 0x4b, - 0x10, 0x01, 0x12, 0x06, 0x0a, 0x02, 0x4f, 0x4e, 0x10, 0x02, 0x12, 0x0e, 0x0a, 0x0a, 0x53, 0x4c, - 0x4f, 0x57, 0x5f, 0x42, 0x4c, 0x49, 0x4e, 0x4b, 0x10, 0x03, 0x12, 0x07, 0x0a, 0x03, 0x4f, 0x46, - 0x46, 0x10, 0x04, 0x2a, 0x28, 0x0a, 0x09, 0x4c, 0x65, 0x64, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x12, 0x07, 0x0a, 0x03, 0x47, 0x45, 0x54, 0x10, 0x00, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x45, 0x54, - 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x45, 0x53, 0x45, 0x54, 0x10, 0x02, 0x42, 0x39, 0x5a, - 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, - 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, - 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x22, 0x22, 0x0a, 0x0c, 0x53, 0x65, 0x74, 0x46, 0x61, 0x75, 0x6c, 0x74, 0x79, 0x52, 0x65, 
0x71, + 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, + 0x75, 0x75, 0x69, 0x64, 0x22, 0x4f, 0x0a, 0x0d, 0x44, 0x65, 0x76, 0x4d, 0x61, 0x6e, 0x61, 0x67, + 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x26, 0x0a, + 0x06, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0e, 0x2e, + 0x63, 0x74, 0x6c, 0x2e, 0x53, 0x6d, 0x64, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x52, 0x06, 0x64, + 0x65, 0x76, 0x69, 0x63, 0x65, 0x22, 0x98, 0x01, 0x0a, 0x0c, 0x53, 0x6d, 0x64, 0x4d, 0x61, 0x6e, + 0x61, 0x67, 0x65, 0x52, 0x65, 0x71, 0x12, 0x25, 0x0a, 0x03, 0x6c, 0x65, 0x64, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x0b, 0x32, 0x11, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4c, 0x65, 0x64, 0x4d, 0x61, 0x6e, + 0x61, 0x67, 0x65, 0x52, 0x65, 0x71, 0x48, 0x00, 0x52, 0x03, 0x6c, 0x65, 0x64, 0x12, 0x2e, 0x0a, + 0x07, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, + 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x44, 0x65, 0x76, 0x52, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x52, + 0x65, 0x71, 0x48, 0x00, 0x52, 0x07, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x12, 0x2b, 0x0a, + 0x06, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x79, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x11, 0x2e, + 0x63, 0x74, 0x6c, 0x2e, 0x53, 0x65, 0x74, 0x46, 0x61, 0x75, 0x6c, 0x74, 0x79, 0x52, 0x65, 0x71, + 0x48, 0x00, 0x52, 0x06, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x79, 0x42, 0x04, 0x0a, 0x02, 0x6f, 0x70, + 0x22, 0xe1, 0x01, 0x0a, 0x0d, 0x53, 0x6d, 0x64, 0x4d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x52, 0x65, + 0x73, 0x70, 0x12, 0x31, 0x0a, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, + 0x0b, 0x32, 0x1b, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x53, 0x6d, 0x64, 0x4d, 0x61, 0x6e, 0x61, 0x67, + 0x65, 0x52, 0x65, 0x73, 0x70, 0x2e, 0x52, 0x61, 0x6e, 0x6b, 0x52, 0x65, 0x73, 0x70, 0x52, 0x05, + 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x1a, 0x48, 0x0a, 0x06, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, + 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, + 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x26, 0x0a, 0x06, 0x64, 0x65, 0x76, 0x69, 0x63, + 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0e, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x53, 0x6d, + 0x64, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x52, 0x06, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x1a, + 0x53, 0x0a, 0x08, 0x52, 0x61, 0x6e, 0x6b, 0x52, 0x65, 0x73, 0x70, 0x12, 0x12, 0x0a, 0x04, 0x72, + 0x61, 0x6e, 0x6b, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, + 0x33, 0x0a, 0x07, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, + 0x32, 0x19, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x53, 0x6d, 0x64, 0x4d, 0x61, 0x6e, 0x61, 0x67, 0x65, + 0x52, 0x65, 0x73, 0x70, 0x2e, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x07, 0x72, 0x65, 0x73, + 0x75, 0x6c, 0x74, 0x73, 0x2a, 0x4c, 0x0a, 0x0c, 0x4e, 0x76, 0x6d, 0x65, 0x44, 0x65, 0x76, 0x53, + 0x74, 0x61, 0x74, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, + 0x00, 0x12, 0x0a, 0x0a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, 0x4c, 0x10, 0x01, 0x12, 0x07, 0x0a, + 0x03, 0x4e, 0x45, 0x57, 0x10, 0x02, 0x12, 0x0b, 0x0a, 0x07, 0x45, 0x56, 0x49, 0x43, 0x54, 0x45, + 0x44, 0x10, 0x03, 0x12, 0x0d, 0x0a, 0x09, 0x55, 0x4e, 0x50, 0x4c, 0x55, 0x47, 0x47, 0x45, 0x44, + 0x10, 0x04, 0x2a, 0x44, 0x0a, 0x08, 0x4c, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x06, + 0x0a, 0x02, 0x4e, 
0x41, 0x10, 0x00, 0x12, 0x0f, 0x0a, 0x0b, 0x51, 0x55, 0x49, 0x43, 0x4b, 0x5f, + 0x42, 0x4c, 0x49, 0x4e, 0x4b, 0x10, 0x01, 0x12, 0x06, 0x0a, 0x02, 0x4f, 0x4e, 0x10, 0x02, 0x12, + 0x0e, 0x0a, 0x0a, 0x53, 0x4c, 0x4f, 0x57, 0x5f, 0x42, 0x4c, 0x49, 0x4e, 0x4b, 0x10, 0x03, 0x12, + 0x07, 0x0a, 0x03, 0x4f, 0x46, 0x46, 0x10, 0x04, 0x2a, 0x28, 0x0a, 0x09, 0x4c, 0x65, 0x64, 0x41, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x07, 0x0a, 0x03, 0x47, 0x45, 0x54, 0x10, 0x00, 0x12, 0x07, + 0x0a, 0x03, 0x53, 0x45, 0x54, 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x45, 0x53, 0x45, 0x54, + 0x10, 0x02, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, + 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, + 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, + 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/lib/control/server_meta.go b/src/control/lib/control/server_meta.go index 246b0b18631..8b08cae15e2 100644 --- a/src/control/lib/control/server_meta.go +++ b/src/control/lib/control/server_meta.go @@ -80,7 +80,6 @@ type ( IDs string // comma separated list of IDs Rank ranklist.Rank ReplaceUUID string // For device replacement, UUID of new device - ReplaceNoReint bool // For device replacement, indicate no reintegration IdentifyTimeout uint32 // For LED identify, blink duration in minutes Operation SmdManageOpcode } @@ -306,7 +305,6 @@ func packPBSmdManageReq(req *SmdManageReq, pbReq *ctlpb.SmdManageReq) error { Replace: &ctlpb.DevReplaceReq{ OldDevUuid: req.IDs, NewDevUuid: req.ReplaceUUID, - NoReint: req.ReplaceNoReint, }, } case LedCheckOp: diff --git a/src/control/lib/control/server_meta_test.go b/src/control/lib/control/server_meta_test.go index 82ec3bd038c..affc59bd210 100644 --- a/src/control/lib/control/server_meta_test.go +++ b/src/control/lib/control/server_meta_test.go @@ -505,17 +505,15 @@ func TestControl_packPBSmdManageReq(t *testing.T) { }, "dev-replace": { req: &SmdManageReq{ - Operation: DevReplaceOp, - IDs: test.MockUUID(1), - ReplaceUUID: test.MockUUID(2), - ReplaceNoReint: true, + Operation: DevReplaceOp, + IDs: test.MockUUID(1), + ReplaceUUID: test.MockUUID(2), }, expPBReq: &ctlpb.SmdManageReq{ Op: &ctlpb.SmdManageReq_Replace{ Replace: &ctlpb.DevReplaceReq{ OldDevUuid: test.MockUUID(1), NewDevUuid: test.MockUUID(2), - NoReint: true, }, }, }, @@ -655,7 +653,6 @@ func TestControl_SmdManage(t *testing.T) { }, expErr: errors.New("> 1 host"), }, - // set-faulty API calls do not return SMD info. "set-faulty": { req: &SmdManageReq{ Operation: SetFaultyOp, diff --git a/src/control/server/config/server.go b/src/control/server/config/server.go index 1a12288011c..ec94784d7d5 100644 --- a/src/control/server/config/server.go +++ b/src/control/server/config/server.go @@ -482,6 +482,10 @@ func (cfg *Server) SetNrHugepages(log logging.Logger, mi *common.MemInfo) error msg = fmt.Sprintf("%s (MD-on-SSD)", msg) // MD-on-SSD has extra sys-xstream for rdb. sysXSCount++ + } else if ec.TargetCount == 1 { + // Avoid DMA buffer allocation failure with single target count by + // increasing the minimum target count used to calculate hugepages. 
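+			// With 2MiB hugepages this reserves one extra target's worth
+			// (512 pages) of headroom for the engine's DMA buffer.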
+ cfgTargetCount++ } } log.Debug(msg) @@ -513,9 +517,7 @@ func (cfg *Server) SetNrHugepages(log logging.Logger, mi *common.MemInfo) error cfg.NrHugepages = minHugepages log.Infof("hugepage count automatically set to %d (%s)", minHugepages, humanize.IBytes(hugePageBytes(minHugepages, mi.HugepageSizeKiB))) - } - - if cfg.NrHugepages < minHugepages { + } else if cfg.NrHugepages < minHugepages { log.Noticef("configured nr_hugepages %d is less than recommended %d, "+ "if this is not intentional update the 'nr_hugepages' config "+ "parameter or remove and it will be automatically calculated", diff --git a/src/control/server/config/server_test.go b/src/control/server/config/server_test.go index 0c11b358573..6961a0f4190 100644 --- a/src/control/server/config/server_test.go +++ b/src/control/server/config/server_test.go @@ -1043,7 +1043,61 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { zeroHpSize: true, expErr: errors.New("invalid system hugepage size"), }, - "zero hugepages set in config; bdevs configured; implicit role assignment": { + "zero hugepages set in config; bdevs configured; single target count": { + extraConfig: func(c *Server) *Server { + return c.WithEngines(defaultEngineCfg(). + WithTargetCount(1). + WithStorage( + storage.NewTierConfig(). + WithStorageClass("dcpm"). + WithScmDeviceList("/dev/pmem1"), + storage.NewTierConfig(). + WithStorageClass("nvme"). + WithBdevDeviceList("0000:81:00.0"), + ), + defaultEngineCfg(). + WithTargetCount(1). + WithStorage( + storage.NewTierConfig(). + WithStorageClass("dcpm"). + WithScmDeviceList("/dev/pmem1"), + storage.NewTierConfig(). + WithStorageClass("nvme"). + WithBdevDeviceList("0000:d0:00.0"), + ), + ) + }, + expNrHugepages: 2048, + }, + "zero hugepages set in config; bdevs configured; single target count; md-on-ssd": { + extraConfig: func(c *Server) *Server { + return c.WithEngines(defaultEngineCfg(). + WithTargetCount(1). + WithStorage( + storage.NewTierConfig(). + WithStorageClass("ram"). + WithScmMountPoint("/foo"), + storage.NewTierConfig(). + WithStorageClass("nvme"). + WithBdevDeviceList("0000:81:00.0"). + WithBdevDeviceRoles(storage.BdevRoleAll), + ), + defaultEngineCfg(). + WithTargetCount(1). + WithStorage( + storage.NewTierConfig(). + WithStorageClass("ram"). + WithScmMountPoint("/foo"), + storage.NewTierConfig(). + WithStorageClass("nvme"). + WithBdevDeviceList("0000:d0:00.0"). + WithBdevDeviceRoles(storage.BdevRoleAll), + ), + ) + }, + expNrHugepages: 2048, + }, + "zero hugepages set in config; bdevs configured": { extraConfig: func(c *Server) *Server { return c.WithEngines(defaultEngineCfg(). 
WithStorage( diff --git a/src/control/server/storage/bdev_test.go b/src/control/server/storage/bdev_test.go index 296cb1ce9ea..90e4e2eeb01 100644 --- a/src/control/server/storage/bdev_test.go +++ b/src/control/server/storage/bdev_test.go @@ -344,21 +344,28 @@ func Test_CalcMinHugepages(t *testing.T) { }, expErr: errors.New("numTargets"), }, - "2KB pagesize; 16 targets": { + "2MiB pagesize; 1 target": { + input: &common.MemInfo{ + HugepageSizeKiB: 2048, + }, + numTargets: 1, + expPages: 512, + }, + "2MiB pagesize; 16 targets": { input: &common.MemInfo{ HugepageSizeKiB: 2048, }, numTargets: 16, expPages: 8192, }, - "2KB pagesize; 31 targets": { + "2MiB pagesize; 31 targets": { input: &common.MemInfo{ HugepageSizeKiB: 2048, }, numTargets: 31, expPages: 15872, }, - "1GB pagesize; 16 targets": { + "1GiB pagesize; 16 targets": { input: &common.MemInfo{ HugepageSizeKiB: 1048576, }, diff --git a/src/mgmt/smd.pb-c.c b/src/mgmt/smd.pb-c.c index 720c0caa029..0be530f7e27 100644 --- a/src/mgmt/smd.pb-c.c +++ b/src/mgmt/smd.pb-c.c @@ -2120,69 +2120,39 @@ const ProtobufCMessageDescriptor ctl__led_manage_req__descriptor = (ProtobufCMessageInit) ctl__led_manage_req__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor ctl__dev_replace_req__field_descriptors[3] = -{ - { - "old_dev_uuid", - 1, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(Ctl__DevReplaceReq, old_dev_uuid), - NULL, - &protobuf_c_empty_string, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "new_dev_uuid", - 2, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(Ctl__DevReplaceReq, new_dev_uuid), - NULL, - &protobuf_c_empty_string, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "no_reint", - 3, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_BOOL, - 0, /* quantifier_offset */ - offsetof(Ctl__DevReplaceReq, no_reint), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor ctl__dev_replace_req__field_descriptors[2] = { + { + "old_dev_uuid", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ + offsetof(Ctl__DevReplaceReq, old_dev_uuid), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "new_dev_uuid", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ + offsetof(Ctl__DevReplaceReq, new_dev_uuid), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned ctl__dev_replace_req__field_indices_by_name[] = { - 1, /* field[1] = new_dev_uuid */ - 2, /* field[2] = no_reint */ - 0, /* field[0] = old_dev_uuid */ -}; -static const ProtobufCIntRange ctl__dev_replace_req__number_ranges[1 + 1] = -{ - { 1, 0 }, - { 0, 3 } + 1, /* field[1] = new_dev_uuid */ + 0, /* field[0] = old_dev_uuid */ }; -const ProtobufCMessageDescriptor ctl__dev_replace_req__descriptor = -{ - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "ctl.DevReplaceReq", - "DevReplaceReq", - "Ctl__DevReplaceReq", - "ctl", - sizeof(Ctl__DevReplaceReq), - 3, - ctl__dev_replace_req__field_descriptors, - ctl__dev_replace_req__field_indices_by_name, - 1, ctl__dev_replace_req__number_ranges, - (ProtobufCMessageInit) ctl__dev_replace_req__init, - NULL,NULL,NULL /* reserved[123] */ +static const ProtobufCIntRange ctl__dev_replace_req__number_ranges[1 + 1] = {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor 
ctl__dev_replace_req__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "ctl.DevReplaceReq", + "DevReplaceReq", + "Ctl__DevReplaceReq", + "ctl", + sizeof(Ctl__DevReplaceReq), + 2, + ctl__dev_replace_req__field_descriptors, + ctl__dev_replace_req__field_indices_by_name, + 1, + ctl__dev_replace_req__number_ranges, + (ProtobufCMessageInit)ctl__dev_replace_req__init, + NULL, + NULL, + NULL /* reserved[123] */ }; static const ProtobufCFieldDescriptor ctl__set_faulty_req__field_descriptors[1] = { diff --git a/src/mgmt/smd.pb-c.h b/src/mgmt/smd.pb-c.h index 7d197849f0e..06a15042abc 100644 --- a/src/mgmt/smd.pb-c.h +++ b/src/mgmt/smd.pb-c.h @@ -606,16 +606,13 @@ struct _Ctl__DevReplaceReq /* * UUID of new (hot-plugged) blobstore/device */ - char *new_dev_uuid; - /* - * Skip device reintegration if set - */ - protobuf_c_boolean no_reint; + char *new_dev_uuid; }; -#define CTL__DEV_REPLACE_REQ__INIT \ - { PROTOBUF_C_MESSAGE_INIT (&ctl__dev_replace_req__descriptor) \ - , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0 } - +#define CTL__DEV_REPLACE_REQ__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT(&ctl__dev_replace_req__descriptor) \ + , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string \ + } struct _Ctl__SetFaultyReq { diff --git a/src/proto/ctl/smd.proto b/src/proto/ctl/smd.proto index 00c704e6a81..57cb414634a 100644 --- a/src/proto/ctl/smd.proto +++ b/src/proto/ctl/smd.proto @@ -198,8 +198,7 @@ message LedManageReq { message DevReplaceReq { string old_dev_uuid = 1; // UUID of old (hot-removed) blobstore/device - string new_dev_uuid = 2; // UUID of new (hot-plugged) blobstore/device - bool no_reint = 3; // Skip device reintegration if set + string new_dev_uuid = 2; // UUID of new (hot-plugged) blobstore/device } message SetFaultyReq { diff --git a/src/tests/ftest/aggregation/dfuse_space_check.py b/src/tests/ftest/aggregation/dfuse_space_check.py index 1d119f807a7..4bae72ef6c5 100644 --- a/src/tests/ftest/aggregation/dfuse_space_check.py +++ b/src/tests/ftest/aggregation/dfuse_space_check.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -9,6 +9,7 @@ from dfuse_utils import get_dfuse, start_dfuse from ior_test_base import IorTestBase +from run_utils import run_remote class DfuseSpaceCheck(IorTestBase): @@ -72,8 +73,11 @@ def write_multiple_files(self, dfuse): while self.get_nvme_free_space(False) >= self.block_size: file_path = os.path.join(dfuse.mount_dir.value, "file{}.txt".format(file_count)) write_dd_cmd = "dd if=/dev/zero of={} bs={} count=1".format(file_path, self.block_size) - if 0 in self.execute_cmd(write_dd_cmd, fail_on_err=True, display_output=False): - file_count += 1 + result = run_remote( + self.log, self.hostlist_clients, write_dd_cmd, verbose=False, timeout=300) + if not result.passed: + self.fail(f"Error running: {write_dd_cmd}") + file_count += 1 return file_count @@ -118,14 +122,16 @@ def test_dfusespacecheck(self): # Create a file as large as we can large_file = os.path.join(dfuse.mount_dir.value, 'largefile.txt') - self.execute_cmd('touch {}'.format(large_file)) + if not run_remote(self.log, self.hostlist_clients, f'touch {large_file}').passed: + self.fail(f"Error creating {large_file}") dd_count = (self.initial_space // self.block_size) + 1 write_dd_cmd = "dd if=/dev/zero of={} bs={} count={}".format( large_file, self.block_size, dd_count) - self.execute_cmd(write_dd_cmd, False) + run_remote(self.log, self.hostlist_clients, write_dd_cmd) # Remove the file - self.execute_cmd('rm -rf {}'.format(large_file)) + if not run_remote(self.log, self.hostlist_clients, f'rm -rf {large_file}').passed: + self.fail(f"Error removing {large_file}") # Wait for aggregation to complete self.wait_for_aggregation() @@ -142,7 +148,10 @@ def test_dfusespacecheck(self): self.pool.set_property("reclaim", "time") # remove all the small files created above. - self.execute_cmd("rm -rf {}".format(os.path.join(dfuse.mount_dir.value, '*'))) + result = run_remote( + self.log, self.hostlist_clients, f"rm -rf {os.path.join(dfuse.mount_dir.value, '*')}") + if not result.passed: + self.fail("Error removing files in mount dir") # Wait for aggregation to complete after file removal self.wait_for_aggregation() diff --git a/src/tests/ftest/control/dmg_scale.py b/src/tests/ftest/control/dmg_scale.py new file mode 100644 index 00000000000..5a268a798c7 --- /dev/null +++ b/src/tests/ftest/control/dmg_scale.py @@ -0,0 +1,182 @@ +""" + (C) Copyright 2024 Intel Corporation. 
+ + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +from apricot import TestWithServers +from telemetry_utils import TelemetryUtils +from test_utils_pool import time_pool_create + +ENGINE_POOL_METRICS_SHORT = [ + "engine_pool_entries_dtx_batched_degree", + "engine_pool_entries_dtx_batched_total", + "engine_pool_ops_akey_enum", + "engine_pool_ops_akey_punch", + "engine_pool_ops_compound", + "engine_pool_ops_dkey_enum", + "engine_pool_ops_dkey_punch", + "engine_pool_ops_dtx_abort", + "engine_pool_ops_dtx_check", + "engine_pool_ops_dtx_commit", + "engine_pool_ops_dtx_refresh", + "engine_pool_ops_ec_agg", + "engine_pool_ops_ec_rep", + "engine_pool_ops_fetch", + "engine_pool_ops_key_query", + "engine_pool_ops_migrate", + "engine_pool_ops_obj_enum", + "engine_pool_ops_obj_punch", + "engine_pool_ops_obj_sync", + "engine_pool_ops_recx_enum", + "engine_pool_ops_tgt_akey_punch", + "engine_pool_ops_tgt_dkey_punch", + "engine_pool_ops_tgt_punch", + "engine_pool_ops_tgt_update", + "engine_pool_ops_update", + "engine_pool_ops_pool_connect", + "engine_pool_ops_pool_disconnect", + "engine_pool_ops_pool_evict", + "engine_pool_ops_pool_query", + "engine_pool_ops_pool_query_space", + "engine_pool_resent", + "engine_pool_restarted", + "engine_pool_retry", + "engine_pool_scrubber_busy_time", + "engine_pool_scrubber_bytes_scrubbed_current", + "engine_pool_scrubber_bytes_scrubbed_prev", + "engine_pool_scrubber_bytes_scrubbed_total", + "engine_pool_scrubber_corruption_current", + "engine_pool_scrubber_corruption_total", + "engine_pool_scrubber_csums_current", + "engine_pool_scrubber_csums_prev", + "engine_pool_scrubber_csums_total", + "engine_pool_scrubber_next_csum_scrub", + "engine_pool_scrubber_next_tree_scrub", + "engine_pool_scrubber_prev_duration", + "engine_pool_scrubber_prev_duration_max", + "engine_pool_scrubber_prev_duration_mean", + "engine_pool_scrubber_prev_duration_min", + "engine_pool_scrubber_prev_duration_stddev", + "engine_pool_scrubber_scrubber_started", + "engine_pool_scrubber_scrubs_completed", + "engine_pool_started_at", + "engine_pool_vos_aggregation_akey_deleted", + "engine_pool_vos_aggregation_akey_scanned", + "engine_pool_vos_aggregation_akey_skipped", + "engine_pool_vos_aggregation_csum_errors", + "engine_pool_vos_aggregation_deleted_ev", + "engine_pool_vos_aggregation_deleted_sv", + "engine_pool_vos_aggregation_dkey_deleted", + "engine_pool_vos_aggregation_dkey_scanned", + "engine_pool_vos_aggregation_dkey_skipped", + "engine_pool_vos_aggregation_epr_duration", + "engine_pool_vos_aggregation_epr_duration_max", + "engine_pool_vos_aggregation_epr_duration_mean", + "engine_pool_vos_aggregation_epr_duration_min", + "engine_pool_vos_aggregation_epr_duration_stddev", + "engine_pool_vos_aggregation_merged_recs", + "engine_pool_vos_aggregation_merged_size", + "engine_pool_vos_aggregation_obj_deleted", + "engine_pool_vos_aggregation_obj_scanned", + "engine_pool_vos_aggregation_obj_skipped", + "engine_pool_vos_aggregation_uncommitted", + "engine_pool_vos_space_nvme_used", + "engine_pool_vos_space_scm_used", + "engine_pool_xferred_fetch", + "engine_pool_xferred_update", + "engine_pool_EC_update_full_stripe", + "engine_pool_EC_update_partial", + "engine_pool_block_allocator_alloc_hint", + "engine_pool_block_allocator_alloc_large", + "engine_pool_block_allocator_alloc_small", + "engine_pool_block_allocator_frags_aging", + "engine_pool_block_allocator_frags_large", + "engine_pool_block_allocator_frags_small", + "engine_pool_block_allocator_free_blks", + "engine_pool_ops_key2anchor" +] + + +class 
DmgScale(TestWithServers):
+    """Verify dmg commands work as expected in a large-scale system.
+
+    :avocado: recursive
+    """
+
+    def test_dmg_scale(self):
+        """Run the following steps and manually collect the duration of each step.
+
+        0. Format storage
+        1. System query
+        2. Create a 100% pool that spans all engines
+        3. Pool query
+        4. Pool destroy
+        5. Create 49 pools spanning all the engines, each pool using 1/50th of the capacity
+        6. Pool list
+        7. Query around 80 pool metrics
+        8. Destroy all 49 pools
+        9. System stop
+        10. System start
+
+        Jira ID: DAOS-10508.
+
+        :avocado: tags=all,manual
+        :avocado: tags=deployment
+        :avocado: tags=DmgScale,test_dmg_scale
+        """
+        # This is a manual test and we need to find the durations in job.log, so add "##" to make
+        # them easy to search for. The log is usually over 1 million lines.
+        self.log_step("## System query")
+        dmg_command = self.get_dmg_command()
+        dmg_command.system_query()
+
+        self.log_step("## Create a 100% pool that spans all engines")
+        pool = self.get_pool(namespace="/run/pool_100/*", create=False)
+        duration = time_pool_create(log=self.log, number=1, pool=pool)
+        self.log.info("## Single pool create duration = %.1f", duration)
+
+        self.log_step("## Pool query")
+        pool.query()
+
+        self.log_step("## Pool destroy")
+        pool.destroy()
+
+        quantity = self.params.get("quantity", "/run/pool_small/*", 1)
+        msg = (f"## Create {quantity} small pools spanning all the engines that together fill up "
+               f"the capacity")
+        self.log_step(msg)
+        pool_0 = self.get_pool(namespace="/run/pool_small/*", create=False)
+        duration_0 = time_pool_create(log=self.log, number=0, pool=pool_0)
+        pools = [pool_0]
+        durations = [duration_0]
+        for count in range(1, quantity):
+            pools.append(self.get_pool(create=False))
+            # Use the SCM and NVMe size of the first pool for the rest of the (quantity - 1) pools.
+            pools[-1].scm_size.update(pool_0.scm_per_rank)
+            pools[-1].nvme_size.update(pool_0.nvme_per_rank)
+            durations.append(time_pool_create(log=self.log, number=count, pool=pools[-1]))
+            msg = (f"Pool {count} created. SCM = {pools[-1].scm_per_rank}; "
+                   f"NVMe = {pools[-1].nvme_per_rank}")
+            self.log.info(msg)
+        self.log.info("## durations = %s", durations)
+        total_duration = sum(durations)
+        self.log.info("## %d pools create duration = %.1f", quantity, total_duration)
+
+        self.log_step("## Pool list")
+        dmg_command.pool_list()
+
+        self.log_step("## Query around 80 pool metrics")
+        # To save time and logs, call telemetry on the first host only. With the 80 pool metrics
+        # above, ~100K lines are printed per host.
+        telemetry_utils = TelemetryUtils(
+            dmg=dmg_command, servers=[self.server_managers[0].hosts[0]])
+        telemetry_utils.get_metrics(name=",".join(ENGINE_POOL_METRICS_SHORT))
+
+        self.log_step(f"## Destroy all {quantity} pools")
+        self.destroy_pools(pools=pools)
+
+        self.log_step("## System stop")
+        self.server_managers[0].system_stop()
+
+        self.log_step("## System start")
+        self.server_managers[0].system_start()
diff --git a/src/tests/ftest/control/dmg_scale.yaml b/src/tests/ftest/control/dmg_scale.yaml
new file mode 100644
index 00000000000..84f4e35bc4d
--- /dev/null
+++ b/src/tests/ftest/control/dmg_scale.yaml
@@ -0,0 +1,37 @@
+# Note: We usually use the extra yaml in aurora-tools, but that extra yaml has test_clients while
+# this test doesn't need any client, so either update the extra yaml or provide a dummy client
+# via -tc.
+hosts: + test_servers: 256 + +timeout: 900 + +daos_server: + pattern_timeout: 60 + +server_config: + name: daos_server + engines_per_host: 2 + engines: + 0: + pinned_numa_node: 0 + nr_xs_helpers: 1 + fabric_iface: ib0 + fabric_iface_port: 31317 + log_file: daos_server0.log + storage: auto + targets: 8 + 1: + pinned_numa_node: 1 + nr_xs_helpers: 1 + fabric_iface: ib1 + fabric_iface_port: 31417 + log_file: daos_server1.log + storage: auto + targets: 8 + +pool_100: + size: 100% +pool_small: + size: 2% + # If we use --size=2% during pool create, we can only create up to 49 pools. + quantity: 49 diff --git a/src/tests/ftest/control/dmg_storage_query.py b/src/tests/ftest/control/dmg_storage_query.py index 924709163b8..02f90dc8984 100644 --- a/src/tests/ftest/control/dmg_storage_query.py +++ b/src/tests/ftest/control/dmg_storage_query.py @@ -252,7 +252,8 @@ def test_dmg_storage_query_device_state(self): manager.update_expected_states(0, ["Errored"]) expect_failed_engine = True try: - self.dmg.storage_set_faulty(uuid=device['uuid']) + self.dmg.storage_set_faulty(host=device['hosts'].split(':')[0], + uuid=device['uuid']) except CommandFailure: if not expect_failed_engine: self.fail("Error setting the faulty state for {}".format(device['uuid'])) diff --git a/src/tests/ftest/daos_test/rebuild.py b/src/tests/ftest/daos_test/rebuild.py index 7cb87650c7a..62c66b7da21 100644 --- a/src/tests/ftest/daos_test/rebuild.py +++ b/src/tests/ftest/daos_test/rebuild.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -8,6 +8,7 @@ class DaosCoreTestRebuild(DaosCoreBase): + # pylint: disable=too-many-public-methods """Run just the daos_test rebuild tests. :avocado: recursive @@ -285,6 +286,70 @@ def test_rebuild_30(self): """ self.run_subtest() + def test_rebuild_31(self): + """Jira ID: DAOS-16027 + + Test Description: + Run daos_test -r -s5 -u subtests=31 + + Use cases: + Core tests for daos_test rebuild + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=unittest,rebuild + :avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_31 + """ + self.run_subtest() + + def test_rebuild_32(self): + """Jira ID: DAOS-16027 + + Test Description: + Run daos_test -r -s5 -u subtests=32 + + Use cases: + Core tests for daos_test rebuild + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=unittest,rebuild + :avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_32 + """ + self.run_subtest() + + def test_rebuild_33(self): + """Jira ID: DAOS-16027 + + Test Description: + Run daos_test -r -s5 -u subtests=33 + + Use cases: + Core tests for daos_test rebuild + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=unittest,rebuild + :avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_33 + """ + self.run_subtest() + + def test_rebuild_34(self): + """Jira ID: DAOS-16027 + + Test Description: + Run daos_test -r -s5 -u subtests=34 + + Use cases: + Core tests for daos_test rebuild + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=unittest,rebuild + :avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_34 + """ + self.run_subtest() + def test_rebuild_35(self): """Jira ID: DAOS-16027 diff --git a/src/tests/ftest/daos_test/rebuild.yaml b/src/tests/ftest/daos_test/rebuild.yaml index 
aae456d7df9..9ceb3c02afb 100644 --- a/src/tests/ftest/daos_test/rebuild.yaml +++ b/src/tests/ftest/daos_test/rebuild.yaml @@ -2,14 +2,20 @@ # required quantity is indicated by the placeholders hosts: test_servers: 4 + timeout: 800 timeouts: test_rebuild_0to10: 2000 test_rebuild_12to15: 1500 test_rebuild_27: 1500 + test_rebuild_31: 400 + test_rebuild_32: 400 + test_rebuild_33: 200 + test_rebuild_34: 200 test_rebuild_35: 180 pool: nvme_size: 0G + server_config: name: daos_server engines_per_host: 2 @@ -40,12 +46,15 @@ server_config: storage: auto transport_config: allow_insecure: false + agent_config: transport_config: allow_insecure: false + dmg: transport_config: allow_insecure: false + daos_tests: num_clients: 1 num_replicas: 1 @@ -67,6 +76,10 @@ daos_tests: test_rebuild_28: DAOS_Rebuild_28 test_rebuild_29: DAOS_Rebuild_29 test_rebuild_30: DAOS_Rebuild_30 + test_rebuild_31: DAOS_Rebuild_31 + test_rebuild_32: DAOS_Rebuild_32 + test_rebuild_33: DAOS_Rebuild_33 + test_rebuild_34: DAOS_Rebuild_34 test_rebuild_35: DAOS_Rebuild_35 daos_test: test_rebuild_0to10: r @@ -86,6 +99,10 @@ daos_tests: test_rebuild_28: r test_rebuild_29: r test_rebuild_30: r + test_rebuild_31: r + test_rebuild_32: r + test_rebuild_33: r + test_rebuild_34: r test_rebuild_35: r args: test_rebuild_0to10: -s3 -u subtests="0-10" @@ -105,6 +122,10 @@ daos_tests: test_rebuild_28: -s3 -u subtests="28" test_rebuild_29: -s5 -u subtests="29" test_rebuild_30: -s5 -u subtests="30" + test_rebuild_31: -s5 -u subtests="31" + test_rebuild_32: -s5 -u subtests="32" + test_rebuild_33: -s5 -u subtests="33" + test_rebuild_34: -s5 -u subtests="34" test_rebuild_35: -s5 -u subtests="35" stopped_ranks: test_rebuild_26: ["random"] diff --git a/src/tests/ftest/daos_test/suite.py b/src/tests/ftest/daos_test/suite.py index 632ab07cc71..f02d882f4ce 100644 --- a/src/tests/ftest/daos_test/suite.py +++ b/src/tests/ftest/daos_test/suite.py @@ -169,7 +169,7 @@ def test_daos_ec_io(self): :avocado: tags=all,pr,daily_regression :avocado: tags=hw,medium,provider,md_on_ssd :avocado: tags=daos_test,daos_core_test - :avocado: tags=DaosCoreTest,test_daos_io,test_daos_ec_io + :avocado: tags=DaosCoreTest,test_daos_ec_io """ self.run_subtest() @@ -185,7 +185,7 @@ def test_daos_ec_obj(self): :avocado: tags=all,pr,daily_regression :avocado: tags=hw,medium,provider,md_on_ssd :avocado: tags=daos_test,daos_core_test - :avocado: tags=DaosCoreTest,test_daos_io,test_daos_ec_obj + :avocado: tags=DaosCoreTest,test_daos_ec_obj """ self.run_subtest() diff --git a/src/tests/ftest/datamover/copy_procs.py b/src/tests/ftest/datamover/copy_procs.py index ce980f373f4..a1734659587 100644 --- a/src/tests/ftest/datamover/copy_procs.py +++ b/src/tests/ftest/datamover/copy_procs.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -46,7 +46,7 @@ def test_copy_procs(self): :avocado: tags=DmvrCopyProcs,test_copy_procs """ # Create pool and containers - pool1 = self.create_pool() + pool1 = self.get_pool() cont1 = self.get_container(pool1) cont2 = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/dst_create.py b/src/tests/ftest/datamover/dst_create.py index a0a5f4bef27..379c152f763 100644 --- a/src/tests/ftest/datamover/dst_create.py +++ b/src/tests/ftest/datamover/dst_create.py @@ -58,8 +58,7 @@ def run_dm_dst_create(self, tool, cont_type, api, check_props): self.set_api(api) # Create 1 pool - pool1 = self.create_pool() - pool1.connect(2) + pool1 = self.get_pool() # Create a source cont cont1 = self.get_container(pool1, type=cont_type) @@ -98,8 +97,7 @@ def run_dm_dst_create(self, tool, cont_type, api, check_props): self.verify_cont(cont3, api, check_props, src_props) # Create another pool - pool2 = self.create_pool() - pool2.connect(2) + pool2 = self.get_pool() result = self.run_datamover( self.test_id + " cont1 to cont4 (different pool) (empty cont)", diff --git a/src/tests/ftest/datamover/large_dir.py b/src/tests/ftest/datamover/large_dir.py index f5e6c0e9aac..53187382007 100644 --- a/src/tests/ftest/datamover/large_dir.py +++ b/src/tests/ftest/datamover/large_dir.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -46,7 +46,7 @@ def run_dm_large_dir(self, tool): file_size = self.params.get("bytes", self.mdtest_cmd.namespace) # create pool and cont1 - pool = self.create_pool() + pool = self.get_pool() cont1 = self.get_container(pool) # run mdtest to create data in cont1 diff --git a/src/tests/ftest/datamover/large_file.py b/src/tests/ftest/datamover/large_file.py index 6fc9faf03e0..b962bdca376 100644 --- a/src/tests/ftest/datamover/large_file.py +++ b/src/tests/ftest/datamover/large_file.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -43,7 +43,7 @@ def run_dm_large_file(self, tool): self.fail("Failed to get ior processes for {}".format(self.tool)) # create pool and cont - pool = self.create_pool() + pool = self.get_pool() cont1 = self.get_container(pool) # create initial data in cont1 diff --git a/src/tests/ftest/datamover/negative.py b/src/tests/ftest/datamover/negative.py index 3b05e1c8dfd..67f6f2f53e5 100644 --- a/src/tests/ftest/datamover/negative.py +++ b/src/tests/ftest/datamover/negative.py @@ -33,13 +33,13 @@ def setUp(self): super().setUp() # Get the parameters - self.test_file = self.ior_cmd.test_file.value + test_file = self.ior_cmd.test_file.value # Setup the directory structures - self.new_posix_test_path() - self.posix_test_file = join(self.posix_local_test_paths[0], self.test_file) - self.daos_test_path = "/" - self.daos_test_file = join(self.daos_test_path, self.test_file) + self.__posix_test_path = self.new_posix_test_path() + self.__posix_test_file = join(self.__posix_test_path, test_file) + self.__daos_test_path = "/" + self.__daos_test_file = join(self.__daos_test_path, test_file) def test_dm_bad_params_dcp(self): """Jira ID: DAOS-5515 - Initial test case. 
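The setUp hunk above renames the shared test paths to double-underscore attributes; the point of that rename is Python's class-private name mangling. A minimal sketch with hypothetical class names (nothing below is from the patch itself):

```python
# Double-underscore attributes are stored as _ClassName__attr, so a test
# class and its base class cannot silently overwrite each other's values.
class BaseTest:
    def __init__(self):
        self.__test_path = "/base"      # stored as _BaseTest__test_path


class NegativeTest(BaseTest):
    def __init__(self):
        super().__init__()
        self.__test_path = "/negative"  # stored as _NegativeTest__test_path


instance = NegativeTest()
print(sorted(vars(instance)))
# ['_BaseTest__test_path', '_NegativeTest__test_path']
```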
@@ -65,7 +65,7 @@ def test_dm_bad_params_dcp(self): start_dfuse(self, dfuse) # Create a test pool - pool1 = self.create_pool() + pool1 = self.get_pool() # Create a special container to hold UNS entries uns_cont = self.get_container(pool1) @@ -75,27 +75,27 @@ def test_dm_bad_params_dcp(self): cont1 = self.get_container(pool1, path=cont1_path) # Create test files - self.run_ior_with_params("POSIX", self.posix_test_file) - self.run_ior_with_params("DAOS_UUID", self.daos_test_file, pool1, cont1) + self.run_ior_with_params("POSIX", self.__posix_test_file) + self.run_ior_with_params("DAOS_UUID", self.__daos_test_file, pool1, cont1) # Bad parameter: required arguments. self.run_datamover( self.test_id + " (missing source pool)", src_path=format_path(), - dst_path=self.posix_local_test_paths[0], + dst_path=self.__posix_test_path, expected_rc=1, expected_output=self.MFU_ERR_DAOS_INVAL_ARG) self.run_datamover( self.test_id + " (missing source cont)", src_path=format_path(pool1), - dst_path=self.posix_local_test_paths[0], + dst_path=self.__posix_test_path, expected_rc=1, expected_output=self.MFU_ERR_DAOS_INVAL_ARG) self.run_datamover( self.test_id + " (missing dest pool)", - src_path=self.posix_local_test_paths[0], + src_path=self.__posix_test_path, dst_path=format_path(), expected_rc=1, expected_output=self.MFU_ERR_DAOS_INVAL_ARG) @@ -134,20 +134,20 @@ def test_dm_bad_params_dcp(self): self.run_datamover( self.test_id + " (invalid source pool)", src_path=format_path(fake_uuid, cont1), - dst_path=self.posix_local_test_paths[0], + dst_path=self.__posix_test_path, expected_rc=1, expected_output="DER_NONEXIST") self.run_datamover( self.test_id + " (invalid source cont)", src_path=format_path(pool1, fake_uuid), - dst_path=self.posix_local_test_paths[0], + dst_path=self.__posix_test_path, expected_rc=1, expected_output="DER_NONEXIST") self.run_datamover( self.test_id + " (invalid dest pool)", - src_path=self.posix_local_test_paths[0], + src_path=self.__posix_test_path, dst_path=format_path(fake_uuid, cont1), expected_rc=1, expected_output="DER_NONEXIST") @@ -155,20 +155,20 @@ def test_dm_bad_params_dcp(self): self.run_datamover( self.test_id + " (invalid source cont path)", src_path=format_path(pool1, cont1, "/fake/fake"), - dst_path=self.posix_local_test_paths[0], + dst_path=self.__posix_test_path, expected_rc=1, expected_output="No such file or directory") self.run_datamover( self.test_id + " (invalid source cont UNS path)", src_path=cont1.path.value + "/fake/fake", - dst_path=self.posix_local_test_paths[0], + dst_path=self.__posix_test_path, expected_rc=1, expected_output="No such file or directory") self.run_datamover( self.test_id + " (invalid dest cont path)", - src_path=self.posix_local_test_paths[0], + src_path=self.__posix_test_path, dst_path=format_path(pool1, cont1, "/fake/fake"), expected_rc=1, expected_output="No such file or directory") @@ -188,7 +188,7 @@ def test_dm_bad_params_dcp(self): expected_output="No such file or directory") # (4) Bad parameter: destination filename is invalid. 
- dst_path = join(self.posix_local_test_paths[0], "d" * 300) + dst_path = join(self.__posix_test_path, "d" * 300) self.run_datamover( self.test_id + " (filename is too long)", src_path=format_path(pool1, cont1), @@ -215,7 +215,7 @@ def test_dm_bad_params_fs_copy(self): start_dfuse(self, dfuse) # Create a test pool - pool1 = self.create_pool() + pool1 = self.get_pool() # Create a special container to hold UNS entries uns_cont = self.get_container(pool1) @@ -225,7 +225,7 @@ def test_dm_bad_params_fs_copy(self): cont1 = self.get_container(pool1, path=cont1_path) # Create test files - self.run_ior_with_params("DAOS", self.daos_test_file, pool1, cont1) + self.run_ior_with_params("DAOS", self.__daos_test_file, pool1, cont1) # (1) Bad parameter: source is destination. self.log_step("Verify error when label source is label dest") diff --git a/src/tests/ftest/datamover/obj_large_posix.py b/src/tests/ftest/datamover/obj_large_posix.py index 87f252ea23e..f522e7a92df 100644 --- a/src/tests/ftest/datamover/obj_large_posix.py +++ b/src/tests/ftest/datamover/obj_large_posix.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -37,7 +37,7 @@ def run_dm_obj_large_posix(self, tool): file_size = self.params.get("bytes", "/run/mdtest/*") # Create pool1 and cont1 - pool1 = self.create_pool() + pool1 = self.get_pool() cont1 = self.get_container(pool1) # Create a large directory in cont1 diff --git a/src/tests/ftest/datamover/obj_small.py b/src/tests/ftest/datamover/obj_small.py index ed9ba5674b5..4e3a4d1fbb2 100644 --- a/src/tests/ftest/datamover/obj_small.py +++ b/src/tests/ftest/datamover/obj_small.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -58,8 +58,7 @@ def run_dm_obj_small(self, tool): self.set_tool(tool) # Create pool1 - pool1 = self.create_pool() - pool1.connect(2) + pool1 = self.get_pool() # Create cont1 cont1 = self.get_container(pool1) @@ -85,8 +84,7 @@ def run_dm_obj_small(self, tool): self.num_akeys_array, self.akey_sizes, self.akey_extents) # Create pool2 - pool2 = self.create_pool() - pool2.connect(2) + pool2 = self.get_pool() # Clone cont1 to a new cont3 in pool2 result = self.run_datamover( diff --git a/src/tests/ftest/datamover/posix_meta_entry.py b/src/tests/ftest/datamover/posix_meta_entry.py index bb608c27853..2e19a74d665 100644 --- a/src/tests/ftest/datamover/posix_meta_entry.py +++ b/src/tests/ftest/datamover/posix_meta_entry.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -7,7 +7,7 @@ from data_mover_test_base import DataMoverTestBase from dfuse_utils import get_dfuse, start_dfuse -from exception_utils import CommandFailure +from run_utils import run_remote class DmvrPosixMetaEntry(DataMoverTestBase): @@ -67,7 +67,7 @@ def run_dm_posix_meta_entry(self, tool): start_dfuse(self, dfuse) # Create 1 pool - pool1 = self.create_pool() + pool1 = self.get_pool() # Create 1 source container with test data cont1 = self.get_container(pool1) @@ -143,7 +143,9 @@ def create_data(self, path): "popd" ] - self.execute_cmd_list(cmd_list) + cmd = " &&\n".join(cmd_list) + if not run_remote(self.log, self.hostlist_clients, cmd, timeout=300).passed: + self.fail("Failed to create data in path") def compare_data(self, path1, path2, cmp_filetype=True, cmp_perms=True, cmp_owner=True, cmp_times=False, @@ -190,11 +192,9 @@ def compare_data(self, path1, path2, cmp_filetype=True, field_printf, entry2) diff_cmd = "diff <({} 2>&1) <({} 2>&1)".format( stat_cmd1, stat_cmd2) - result = self.execute_cmd(diff_cmd, fail_on_err=False) - if 0 not in result or len(result) > 1: - hosts = [str(nodes) for code, nodes in list(result.items()) if code != 0] - raise CommandFailure( - "Command to check files failed '{}' on {}".format(diff_cmd, hosts)) + result = run_remote(self.log, self.hostlist_clients, diff_cmd, timeout=300) + if not result.passed or not result.homogeneous: + self.fail(f"Unexpected diff between {entry1} and {entry2}") if cmp_xattr: # Use getfattr to get the xattrs @@ -202,13 +202,6 @@ def compare_data(self, path1, path2, cmp_filetype=True, xattr_cmd2 = "getfattr -d -h '{}'".format(entry2) diff_cmd = "diff -I '^#' <({} 2>&1) <({} 2>&1)".format( xattr_cmd1, xattr_cmd2) - self.execute_cmd(diff_cmd) - - def execute_cmd_list(self, cmd_list): - """Execute a list of commands, separated by &&. - - Args: - cmd_list (list): A list of commands to execute. - """ - cmd = " &&\n".join(cmd_list) - self.execute_cmd(cmd) + result = run_remote(self.log, self.hostlist_clients, diff_cmd, timeout=300) + if not result.passed or not result.homogeneous: + self.fail(f"Unexpected diff between {entry1} and {entry2}") diff --git a/src/tests/ftest/datamover/posix_preserve_props.py b/src/tests/ftest/datamover/posix_preserve_props.py index 91df7c11135..bc1e52ace71 100644 --- a/src/tests/ftest/datamover/posix_preserve_props.py +++ b/src/tests/ftest/datamover/posix_preserve_props.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -56,8 +56,7 @@ def run_dm_preserve_props(self, tool, cont_type, api): self.set_api(api) # Create 1 pool - pool1 = self.create_pool() - pool1.connect(2) + pool1 = self.get_pool() # set the path to read and write container properties self.preserve_props_path = join(self.tmp, "cont_props.h5") diff --git a/src/tests/ftest/datamover/posix_subsets.py b/src/tests/ftest/datamover/posix_subsets.py index 45e33d9cec9..fd14e0cf7b0 100644 --- a/src/tests/ftest/datamover/posix_subsets.py +++ b/src/tests/ftest/datamover/posix_subsets.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -54,7 +54,7 @@ def run_dm_posix_subsets(self, tool): start_dfuse(self, dfuse) # Create 1 pool - pool1 = self.create_pool() + pool1 = self.get_pool() # create dfuse containers to test copying to dfuse subdirectories dfuse_cont1 = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/posix_symlinks.py b/src/tests/ftest/datamover/posix_symlinks.py index 68d60e4c973..47fd9ca25b7 100644 --- a/src/tests/ftest/datamover/posix_symlinks.py +++ b/src/tests/ftest/datamover/posix_symlinks.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -7,6 +7,7 @@ from data_mover_test_base import DataMoverTestBase from dfuse_utils import get_dfuse, start_dfuse +from run_utils import run_remote class DmvrPosixSymlinks(DataMoverTestBase): @@ -60,7 +61,7 @@ def run_dm_posix_symlinks(self, tool): start_dfuse(self, dfuse) # Create 1 pool - pool1 = self.create_pool() + pool1 = self.get_pool() # Create a special container to hold UNS entries uns_cont = self.get_container(pool1) @@ -119,8 +120,9 @@ def run_dm_posix_symlinks_fun(self, pool, cont, link_fun, link_desc): if do_deref: # Use POSIX cp to create a baseline for dereferencing deref_baseline_path = join(posix_test_path, "baseline_" + link_desc) - self.execute_cmd("cp -r --dereference '{}' '{}'".format( - src_posix_path, deref_baseline_path)) + cp_cmd = f"cp -r --dereference '{src_posix_path}' '{deref_baseline_path}'" + if not run_remote(self.log, self.hostlist_clients, cp_cmd, timeout=300).passed: + self.fail("Failed to create dereference baseline") diff_src = deref_baseline_path else: # Just compare against the original @@ -195,7 +197,9 @@ def create_links_forward(self, path): "popd" ] - self.execute_cmd_list(cmd_list) + cmd = " &&\n".join(cmd_list) + if not run_remote(self.log, self.hostlist_clients, cmd, timeout=300).passed: + self.fail(f"Failed to create forward symlinks in {path}") def create_links_backward(self, path): """ @@ -225,7 +229,9 @@ def create_links_backward(self, path): "popd" ] - self.execute_cmd_list(cmd_list) + cmd = " &&\n".join(cmd_list) + if not run_remote(self.log, self.hostlist_clients, cmd, timeout=300).passed: + self.fail(f"Failed to create backward symlinks in {path}") def create_links_mixed(self, path): """ @@ -256,12 +262,6 @@ def create_links_mixed(self, path): "popd" ] - self.execute_cmd_list(cmd_list) - - def execute_cmd_list(self, cmd_list): - """Execute a list of commands, separated by &&. - Args: - cmd_list (list): A list of commands to execute. - """ cmd = " &&\n".join(cmd_list) - self.execute_cmd(cmd) + if not run_remote(self.log, self.hostlist_clients, cmd, timeout=300).passed: + self.fail(f"Failed to create mixed symlinks in {path}") diff --git a/src/tests/ftest/datamover/posix_types.py b/src/tests/ftest/datamover/posix_types.py index 0ef85d018a8..79583bfb574 100644 --- a/src/tests/ftest/datamover/posix_types.py +++ b/src/tests/ftest/datamover/posix_types.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -68,8 +68,8 @@ def run_dm_posix_types(self, tool): start_dfuse(self, dfuse) # Create 2 pools - pool1 = self.create_pool(label='pool1') - pool2 = self.create_pool(label='pool2') + pool1 = self.get_pool(label='pool1') + pool2 = self.get_pool(label='pool2') # Create a special container to hold UNS entries uns_cont = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/serial_large_posix.py b/src/tests/ftest/datamover/serial_large_posix.py index 6917097d901..0feb7253a8d 100644 --- a/src/tests/ftest/datamover/serial_large_posix.py +++ b/src/tests/ftest/datamover/serial_large_posix.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -43,7 +43,7 @@ def run_dm_serial_large_posix(self, tool): file_size = self.params.get("bytes", "/run/mdtest/*") # Create pool1 and cont1 - pool1 = self.create_pool() + pool1 = self.get_pool() cont1 = self.get_container(pool1) # Create a large directory in cont1 @@ -51,7 +51,7 @@ def run_dm_serial_large_posix(self, tool): self.run_mdtest_with_params("DAOS", "/", pool1, cont1, flags=mdtest_flags[0]) # Create pool2 - pool2 = self.create_pool() + pool2 = self.get_pool() # Use dfuse as a shared intermediate for serialize + deserialize dfuse_cont = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/serial_small.py b/src/tests/ftest/datamover/serial_small.py index 28ce84bee35..75e91285959 100644 --- a/src/tests/ftest/datamover/serial_small.py +++ b/src/tests/ftest/datamover/serial_small.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -56,8 +56,7 @@ def run_dm_serial_small(self, tool): self.set_tool(tool) # Create pool1 - pool1 = self.create_pool() - pool1.connect(2) + pool1 = self.get_pool() # Create cont1 cont1 = self.get_container(pool1) @@ -69,8 +68,7 @@ def run_dm_serial_small(self, tool): self.num_akeys_array, self.akey_sizes, self.akey_extents) # Create pool2 - pool2 = self.create_pool() - pool2.connect(2) + pool2 = self.get_pool() # Serialize/Deserialize cont1 to a new cont2 in pool2 result = self.run_datamover( diff --git a/src/tests/ftest/deployment/basic_checkout.py b/src/tests/ftest/deployment/basic_checkout.py index 52a828e8329..216e89fd795 100644 --- a/src/tests/ftest/deployment/basic_checkout.py +++ b/src/tests/ftest/deployment/basic_checkout.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -120,7 +120,7 @@ def test_basic_checkout_dm(self): self.ior_ppn = self.ppn # create pool and container - pool = self.create_pool() + pool = self.get_pool() cont = self.get_container(pool, oclass=self.ior_cmd.dfs_oclass.value) # run datamover diff --git a/src/tests/ftest/deployment/basic_checkout.yaml b/src/tests/ftest/deployment/basic_checkout.yaml index 03d420ab82b..7ce9515bae8 100644 --- a/src/tests/ftest/deployment/basic_checkout.yaml +++ b/src/tests/ftest/deployment/basic_checkout.yaml @@ -70,7 +70,7 @@ mdtest_easy: &mdtest_easy_base write_bytes: 0 num_of_files_dirs: 100000000 stonewall_timer: 30 - stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile" + stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false mdtest_dfs_s1: <<: *mdtest_easy_base diff --git a/src/tests/ftest/deployment/critical_integration.py b/src/tests/ftest/deployment/critical_integration.py index c8b9b296f3c..d1cf28ab555 100644 --- a/src/tests/ftest/deployment/critical_integration.py +++ b/src/tests/ftest/deployment/critical_integration.py @@ -11,7 +11,7 @@ from ClusterShell.NodeSet import NodeSet from exception_utils import CommandFailure from general_utils import DaosTestError, get_journalctl, journalctl_time, run_command -from ior_test_base import IorTestBase +from run_utils import run_remote # pylint: disable-next=fixme # TODO Provision all daos nodes using provisioning tool provided by HPCM @@ -67,7 +67,8 @@ def test_passwdlessssh_versioncheck(self): daos_server_version_list.append(out['response']['version']) if check_remote_root_access: run_command(remote_root_access) - IorTestBase._execute_command(self, command_for_inter_node, hosts=[host]) + if not run_remote(self.log, NodeSet(host), command_for_inter_node).passed: + self.fail(f"Inter-node clush failed on {host}") except (DaosTestError, CommandFailure, KeyError) as error: self.log.error("Error: %s", error) failed_nodes.add(host) diff --git a/src/tests/ftest/deployment/disk_failure.py b/src/tests/ftest/deployment/disk_failure.py index 23f2132171c..022498187a8 100644 --- a/src/tests/ftest/deployment/disk_failure.py +++ b/src/tests/ftest/deployment/disk_failure.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2022-2023 Intel Corporation. + (C) Copyright 2022-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -8,7 +8,6 @@ import time from avocado import fail_on -from ClusterShell.NodeSet import NodeSet from dmg_utils import get_dmg_response, get_storage_query_device_info from exception_utils import CommandFailure from general_utils import list_to_str @@ -67,14 +66,12 @@ def verify_disk_failure(self, num_pool): # Evict a random target from the system evict_device = random.choice(device_info) # nosec self.log.info("Evicting random target: %s", evict_device["uuid"]) - original_hostlist = self.dmg_command.hostlist try: - self.dmg_command.hostlist = evict_device["hosts"].split(":")[0] - get_dmg_response(self.dmg_command.storage_set_faulty, uuid=evict_device["uuid"]) + get_dmg_response(self.dmg_command.storage_set_faulty, + host=evict_device["hosts"].split(":")[0], + uuid=evict_device["uuid"]) except CommandFailure: self.fail("Error evicting target {}".format(evict_device["uuid"])) - finally: - self.dmg_command.hostlist = original_hostlist done = "Completed setting all devices to fault" self.print_and_assert_on_rebuild_failure(done) for thread in threads: @@ -82,16 +79,13 @@ def verify_disk_failure(self, num_pool): # Now replace the faulty NVME device. 
self.log.info("Replacing evicted target: %s", evict_device["uuid"]) - original_hostlist = self.dmg_command.hostlist try: - self.dmg_command.hostlist = evict_device["hosts"].split(":")[0] - get_dmg_response( - self.dmg_command.storage_replace_nvme, old_uuid=evict_device["uuid"], - new_uuid=evict_device["uuid"]) + get_dmg_response(self.dmg_command.storage_replace_nvme, + host=evict_device["hosts"].split(":")[0], + old_uuid=evict_device["uuid"], + new_uuid=evict_device["uuid"]) except CommandFailure as error: self.fail(str(error)) - finally: - self.dmg_command.hostlist = original_hostlist time.sleep(10) self.log.info( "Reintegrating evicted target: uuid=%s, rank=%s, targets=%s", @@ -119,7 +113,6 @@ def test_disk_failure_w_rf(self): Test disk failures during the IO operation. :avocado: tags=all,manual - :avocado: tags=hw,medium :avocado: tags=deployment,disk_failure :avocado: tags=DiskFailureTest,test_disk_failure_w_rf """ @@ -131,7 +124,6 @@ def test_disk_fault_to_normal(self): Test a disk inducing faults and resetting is back to normal state. :avocado: tags=all,manual - :avocado: tags=hw,medium :avocado: tags=deployment,disk_failure :avocado: tags=DiskFailureTest,test_disk_fault_to_normal """ @@ -142,13 +134,14 @@ def test_disk_fault_to_normal(self): for key in sorted(device): self.log.info(" %s: %s", key, device[key]) try: - self.dmg_command.hostlist = NodeSet(host) # Set the device as faulty - get_dmg_response(self.dmg_command.storage_set_faulty, uuid=device["uuid"]) + get_dmg_response(self.dmg_command.storage_set_faulty, host=host, + uuid=device["uuid"]) # Replace the device with same uuid. passed = False for _ in range(10): - data = self.dmg_command.storage_replace_nvme(old_uuid=device["uuid"], + data = self.dmg_command.storage_replace_nvme(host=host, + old_uuid=device["uuid"], new_uuid=device["uuid"]) if not data['error'] and len(data['response']['host_errors']) == 0: passed = True @@ -158,5 +151,3 @@ def test_disk_fault_to_normal(self): self.fail('Replacing faulty device did not pass after 10 retries') except CommandFailure as error: self.fail(str(error)) - finally: - self.dmg_command.hostlist = self.server_managers[0].hosts diff --git a/src/tests/ftest/deployment/io_sys_admin.py b/src/tests/ftest/deployment/io_sys_admin.py index bca8373ba5c..265c1ad42f3 100644 --- a/src/tests/ftest/deployment/io_sys_admin.py +++ b/src/tests/ftest/deployment/io_sys_admin.py @@ -40,66 +40,88 @@ def test_io_sys_admin(self): new_cont_user = self.params.get("user", "/run/container_set_owner/*") new_cont_group = self.params.get("group", "/run/container_set_owner/*") + # Toggle independent steps + steps_to_run = { + "pool_create_ownership": True, + "storage_system_query": True, + "io": True, + "snapshot": True, + "datamover": True + } + for step in steps_to_run: + run = self.params.get(step, "/run/io_sys_admin/steps_to_run/*", None) + if run is not None: + steps_to_run[step] = run + dmg = self.get_dmg_command() daos = self.get_daos_command() - for idx in range(1, 4): - pool = self.get_pool(namespace=f"/run/pool_{idx}/", create=False) - check_pool_creation(self, [pool], 60) - containers = [] - for cont_idx in range(1, 4): - containers.append( - self.get_container(pool, namespace=f"/run/container_{cont_idx}/")) - containers[-1].set_owner(f"{new_cont_user}@", f"{new_cont_group}@") - - daos.container_list(pool.identifier) - self.destroy_containers(containers) - pool.destroy() - - # dmg storage scan - dmg.storage_scan() - dmg.system_query() - dmg.system_leader_query() - - # write large data sets - 
self.run_file_count() - # create snapshot - self.container[-1].create_snap() - # overwrite the last ior file - self.ior_cmd.signature.update('456') - self.processes = self.ior_np - self.ppn = self.ior_ppn - self.run_ior_with_pool(create_pool=False, create_cont=False) - - nvme_free_space_before_snap_destroy = self.get_free_space()[1] - # delete snapshot - self.container[-1].destroy_snap(epc=self.container[-1].epoch) - # Now check if the space is returned back. - counter = 1 - returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy - - data_written = (int(self.ppn) * human_to_bytes(self.ior_cmd.block_size.value)) - while returned_space < int(data_written): - # try to wait for 4 x 60 secs for aggregation to be completed or - # else exit the test with a failure. - if counter > 4: - self.log.info("Free space before snapshot destroy: %s", - nvme_free_space_before_snap_destroy) - self.log.info("Free space when test terminated: %s", - self.get_free_space()[1]) - self.fail("Aggregation did not complete as expected") - - time.sleep(60) + if steps_to_run["pool_create_ownership"]: + self.log_step("Verify pool creation time and container set-owner") + for idx in range(1, 4): + pool = self.get_pool(namespace=f"/run/pool_{idx}/", create=False) + check_pool_creation(self, [pool], 60) + containers = [] + for cont_idx in range(1, 4): + containers.append( + self.get_container(pool, namespace=f"/run/container_{cont_idx}/")) + containers[-1].set_owner(f"{new_cont_user}@", f"{new_cont_group}@") + + daos.container_list(pool.identifier) + self.destroy_containers(containers) + pool.destroy() + + if steps_to_run["storage_system_query"]: + self.log_step("Verify storage scan and system query") + dmg.storage_scan() + dmg.system_query() + dmg.system_leader_query() + + if steps_to_run["io"]: + self.log_step("Verifying large dataset IO") + self.run_file_count() + + if steps_to_run["snapshot"]: + self.log_step("Verifying snapshot creation and aggregation") + self.container[-1].create_snap() + # overwrite the last ior file + self.ior_cmd.signature.update('456') + self.processes = self.ior_np + self.ppn = self.ior_ppn + self.run_ior_with_pool(create_pool=False, create_cont=False) + + nvme_free_space_before_snap_destroy = self.get_free_space()[1] + # delete snapshot + self.container[-1].destroy_snap(epc=self.container[-1].epoch) + # Now check if the space is returned back. + counter = 1 returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy - counter += 1 - - self.log.info("#####Starting FS_COPY Test") - self.run_dm_activities_with_ior("FS_COPY", self.pool, self.container[-1]) - self.log.info("#####Starting DCP Test") - self.run_dm_activities_with_ior("DCP", self.pool, self.container[-1]) - self.log.info("#####Starting DSERIAL Test") - self.run_dm_activities_with_ior("DSERIAL", self.pool, self.container[-1]) - self.log.info("#####Starting CONT_CLONE Test") - self.run_dm_activities_with_ior("CONT_CLONE", self.pool, self.container[-1]) - self.log.info("#####Completed all Datamover tests") - self.container.pop(0) + + data_written = (int(self.ppn) * human_to_bytes(self.ior_cmd.block_size.value)) + while returned_space < int(data_written): + # try to wait for 4 x 60 secs for aggregation to be completed or + # else exit the test with a failure. 
+ if counter > 4: + self.log.info( + "Free space before snapshot destroy: %s", + nvme_free_space_before_snap_destroy) + self.log.info( + "Free space when test terminated: %s", self.get_free_space()[1]) + self.fail("Aggregation did not complete as expected") + + time.sleep(60) + returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy + counter += 1 + + if steps_to_run["datamover"]: + self.log_step("Verifying datamover") + self.log.info("#####Starting FS_COPY Test") + self.run_dm_activities_with_ior("FS_COPY", self.pool, self.container[-1]) + self.log.info("#####Starting DCP Test") + self.run_dm_activities_with_ior("DCP", self.pool, self.container[-1]) + self.log.info("#####Starting DSERIAL Test") + self.run_dm_activities_with_ior("DSERIAL", self.pool, self.container[-1]) + self.log.info("#####Starting CONT_CLONE Test") + self.run_dm_activities_with_ior("CONT_CLONE", self.pool, self.container[-1]) + self.log.info("#####Completed all Datamover tests") + self.container.pop(0) diff --git a/src/tests/ftest/deployment/io_sys_admin.yaml b/src/tests/ftest/deployment/io_sys_admin.yaml index 6c3edab15b3..f2a238ad4b5 100644 --- a/src/tests/ftest/deployment/io_sys_admin.yaml +++ b/src/tests/ftest/deployment/io_sys_admin.yaml @@ -104,3 +104,11 @@ dcp: np: 16 hdf5_vol: plugin_path: /usr/lib64/mpich/lib + +io_sys_admin: + steps_to_run: + pool_create_ownership: True + storage_system_query: True + io: True + snapshot: True + datamover: True diff --git a/src/tests/ftest/dfuse/sparse_file.py b/src/tests/ftest/dfuse/sparse_file.py index ef31c3816e6..484b787c38c 100644 --- a/src/tests/ftest/dfuse/sparse_file.py +++ b/src/tests/ftest/dfuse/sparse_file.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -11,6 +11,7 @@ from dfuse_utils import get_dfuse, start_dfuse from general_utils import get_remote_file_size from ior_test_base import IorTestBase +from run_utils import run_remote class SparseFile(IorTestBase): @@ -60,7 +61,8 @@ def test_sparsefile(self): # create large file and perform write to it so that if goes out of # space. sparse_file = os.path.join(dfuse.mount_dir.value, 'sparsefile.txt') - self.execute_cmd("touch {}".format(sparse_file)) + if not run_remote(self.log, self.hostlist_clients, f"touch {sparse_file}").passed: + self.fail(f"Failed to create {sparse_file}") self.log.info("File size (in bytes) before truncate: %s", get_remote_file_size(self.hostlist_clients[0], sparse_file)) @@ -84,7 +86,8 @@ def test_sparsefile(self): # write to the first byte of the file with char 'A' dd_first_byte = "echo 'A' | dd conv=notrunc of={} bs=1 count=1".format(sparse_file) - self.execute_cmd(dd_first_byte) + if not run_remote(self.log, self.hostlist_clients, dd_first_byte).passed: + self.fail(f"Failed to create first byte in {sparse_file}") fsize_write_1stbyte = get_remote_file_size(self.hostlist_clients[0], sparse_file) self.log.info("File size (in bytes) after writing first byte: %s", fsize_write_1stbyte) # verify file did not got overwritten after dd write. 
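The conversions from execute_cmd to run_remote throughout this patch all repeat the same check-and-fail idiom. A minimal sketch of that idiom as a helper, assuming the run_utils API used elsewhere in this diff (run_or_fail itself is hypothetical and not part of the patch; .passed and .failed_hosts are the result attributes already referenced in these hunks):

```python
from run_utils import run_remote


def run_or_fail(test, hosts, command, timeout=300):
    """Run a command on remote hosts and fail the test on any per-host error."""
    result = run_remote(test.log, hosts, command, timeout=timeout)
    if not result.passed:
        # result.failed_hosts lists the hosts whose command exited non-zero
        test.fail(f"Command '{command}' failed on {result.failed_hosts}")
    return result
```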
@@ -93,7 +96,8 @@ def test_sparsefile(self): # write to the 1024th byte position of the file dd_1024_byte = "echo 'A' | dd conv=notrunc of={} obs=1 seek=1023 bs=1 count=1".format( sparse_file) - self.execute_cmd(dd_1024_byte) + if not run_remote(self.log, self.hostlist_clients, dd_1024_byte).passed: + self.fail(f"Failed to create 1024th byte in {sparse_file}") fsize_write_1024thwrite = get_remote_file_size(self.hostlist_clients[0], sparse_file) self.log.info("File size (in bytes) after writing 1024th byte: %s", fsize_write_1024thwrite) # verify file did not got overwritten after dd write. @@ -110,13 +114,13 @@ def test_sparsefile(self): # check the middle 1022 bytes if they are filled with zeros middle_1022_bytes = "cmp --ignore-initial=1 --bytes=1022 {} {}".format( sparse_file, "/dev/zero") - self.execute_cmd(middle_1022_bytes) + if not run_remote(self.log, self.hostlist_clients, middle_1022_bytes).passed: + self.fail(f"Unexpected bytes in {sparse_file}") # read last 512 bytes which should be zeros till end of file. ignore_bytes = self.space_before - 512 read_till_eof = "cmp --ignore-initial={} {} {}".format( ignore_bytes, sparse_file, "/dev/zero") - # self.execute_cmd(read_till_eof, False) # fail the test if the above command is successful. - if 0 in self.execute_cmd(read_till_eof, False): + if run_remote(self.log, self.hostlist_clients, read_till_eof).passed: self.fail("read_till_eof command was supposed to fail. But it completed successfully.") diff --git a/src/tests/ftest/harness/basic.py b/src/tests/ftest/harness/basic.py index e0e39e15c6c..49759f4be09 100644 --- a/src/tests/ftest/harness/basic.py +++ b/src/tests/ftest/harness/basic.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2022-2023 Intel Corporation. + (C) Copyright 2022-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -52,7 +52,7 @@ def test_always_passes_hw(self): """Simple test of apricot test code. :avocado: tags=all - :avocado: tags=hw,medium,large + :avocado: tags=hw,hw_vmd,medium,large :avocado: tags=harness,harness_basic_test,always_passes :avocado: tags=HarnessBasicTest,test_always_passes_hw """ @@ -90,7 +90,7 @@ def test_load_mpi_hw(self): """Simple test of apricot test code to load the openmpi module. :avocado: tags=all - :avocado: tags=hw,medium,large + :avocado: tags=hw,hw_vmd,medium,large :avocado: tags=harness,harness_basic_test,load_mpi :avocado: tags=HarnessBasicTest,test_load_mpi_hw """ diff --git a/src/tests/ftest/harness/skip_list.py b/src/tests/ftest/harness/skip_list.py index 708ae12aa98..9bf08a6a154 100644 --- a/src/tests/ftest/harness/skip_list.py +++ b/src/tests/ftest/harness/skip_list.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2021 Intel Corporation. + (C) Copyright 2021-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -90,7 +90,7 @@ def test_case_1(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,test_skips + :avocado: tags=harness,harness_skips :avocado: tags=TestHarnessSkipsSkipped,test_case_1 """ if not self.cancelled: @@ -101,7 +101,7 @@ def test_case_3(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,test_skips + :avocado: tags=harness,harness_skips :avocado: tags=TestHarnessSkipsSkipped,test_case_3 """ if not self.cancelled: @@ -125,7 +125,7 @@ def test_case_2(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,test_skips + :avocado: tags=harness,harness_skips :avocado: tags=TestHarnessSkipsRun,test_case_2 """ @@ -134,7 +134,7 @@ def test_case_4(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,test_skips + :avocado: tags=harness,harness_skips :avocado: tags=TestHarnessSkipsRun,test_case_4 """ @@ -143,7 +143,7 @@ def test_case_5(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,test_skips + :avocado: tags=harness,harness_skips :avocado: tags=TestHarnessSkipsRun,test_case_5 """ @@ -152,6 +152,6 @@ def test_case_6(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,test_skips + :avocado: tags=harness,harness_skips :avocado: tags=TestHarnessSkipsRun,test_case_6 """ diff --git a/src/tests/ftest/io/large_file_count.yaml b/src/tests/ftest/io/large_file_count.yaml index 619143a83e8..6ff375cf3a9 100644 --- a/src/tests/ftest/io/large_file_count.yaml +++ b/src/tests/ftest/io/large_file_count.yaml @@ -44,6 +44,8 @@ ior: repetitions: 1 transfer_size: '1M' block_size: '7G' + env_vars: + - D_IL_REPORT=1 dfuse: disable_caching: true @@ -59,3 +61,5 @@ mdtest: write_bytes: 4096 read_bytes: 4096 depth: 0 + env_vars: + - D_IL_REPORT=1 diff --git a/src/tests/ftest/io/small_file_count.yaml b/src/tests/ftest/io/small_file_count.yaml index b9bf23cd126..79e02c3d787 100644 --- a/src/tests/ftest/io/small_file_count.yaml +++ b/src/tests/ftest/io/small_file_count.yaml @@ -45,6 +45,8 @@ ior: repetitions: 1 transfer_size: '1M' block_size: '2G' + env_vars: + - D_IL_REPORT=1 dfuse: disable_caching: true @@ -60,3 +62,5 @@ mdtest: write_bytes: 4096 read_bytes: 4096 depth: 0 + env_vars: + - D_IL_REPORT=1 diff --git a/src/tests/ftest/ior/small.yaml b/src/tests/ftest/ior/small.yaml index b0b21620a93..b638a396580 100644 --- a/src/tests/ftest/ior/small.yaml +++ b/src/tests/ftest/ior/small.yaml @@ -38,6 +38,8 @@ container: properties: cksum:crc16,cksum_size:16384,srv_cksum:on control_method: daos ior: + env_vars: + - D_IL_REPORT=1 ior_timeout: 75 client_processes: ppn: 32 diff --git a/src/tests/ftest/performance/ior_easy.yaml b/src/tests/ftest/performance/ior_easy.yaml index fca6fa3ba70..b846b179277 100644 --- a/src/tests/ftest/performance/ior_easy.yaml +++ b/src/tests/ftest/performance/ior_easy.yaml @@ -28,7 +28,7 @@ server_config: pool: size: 95% - properties: ec_cell_sz:128KiB + properties: ec_cell_sz:1MiB container: type: POSIX diff --git a/src/tests/ftest/performance/mdtest_easy.py b/src/tests/ftest/performance/mdtest_easy.py index 86db9f0c49d..c1a768694af 100644 --- a/src/tests/ftest/performance/mdtest_easy.py +++ b/src/tests/ftest/performance/mdtest_easy.py @@ -33,6 +33,15 @@ def test_performance_mdtest_easy_dfs_ec_16p2g1(self): """ self.run_performance_mdtest(namespace="/run/mdtest_dfs_ec_16p2g1/*") + def test_performance_mdtest_easy_dfs_rp_3g1(self): + """Test Description: Run MDTest Easy, DFS, RP_3G1. 
+ + :avocado: tags=all,manual + :avocado: tags=performance + :avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfs_rp_3g1 + """ + self.run_performance_mdtest(namespace="/run/mdtest_dfs_rp_3g1/*") + def test_performance_mdtest_easy_pil4dfs_s1(self): """Test Description: Run MDTest Easy, dfuse + pil4dfs, S1. @@ -51,3 +60,12 @@ def test_performance_mdtest_easy_pil4dfs_ec_16p2g1(self): :avocado: tags=MdtestEasy,test_performance_mdtest_easy_pil4dfs_ec_16p2g1 """ self.run_performance_mdtest(namespace="/run/mdtest_pil4dfs_ec_16p2g1/*") + + def test_performance_mdtest_easy_pil4dfs_rp_3g1(self): + """Test Description: Run MDTest Easy, dfuse + pil4dfs, RP_3G1. + + :avocado: tags=all,manual + :avocado: tags=performance + :avocado: tags=MdtestEasy,test_performance_mdtest_easy_pil4dfs_rp_3g1 + """ + self.run_performance_mdtest(namespace="/run/mdtest_pil4dfs_rp_3g1/*") diff --git a/src/tests/ftest/performance/mdtest_easy.yaml b/src/tests/ftest/performance/mdtest_easy.yaml index 8fdd27031c2..d2925536b79 100644 --- a/src/tests/ftest/performance/mdtest_easy.yaml +++ b/src/tests/ftest/performance/mdtest_easy.yaml @@ -28,7 +28,7 @@ server_config: pool: size: 95% - properties: ec_cell_sz:128KiB + properties: ec_cell_sz:1MiB container: type: POSIX @@ -46,7 +46,7 @@ mdtest: &mdtest_base write_bytes: 0 num_of_files_dirs: 100000000 stonewall_timer: 30 - stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile" + stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false mdtest_s1: &mdtest_s1 @@ -61,6 +61,12 @@ mdtest_ec_16p2g1: &mdtest_ec_16p2g1 dfs_dir_oclass: RP_3GX dfs_chunk: 16MiB +mdtest_rp_3g1: &mdtest_rp_3g1 + <<: *mdtest_base + dfs_oclass: RP_3G1 + dfs_dir_oclass: RP_3GX + dfs_chunk: 1MiB + mdtest_dfs_s1: api: DFS <<: *mdtest_s1 @@ -69,6 +75,10 @@ mdtest_dfs_ec_16p2g1: api: DFS <<: *mdtest_ec_16p2g1 +mdtest_dfs_rp_3g1: + api: DFS + <<: *mdtest_rp_3g1 + mdtest_pil4dfs_s1: api: POSIX+PIL4DFS # handled by ftest <<: *mdtest_s1 @@ -77,6 +87,10 @@ mdtest_pil4dfs_ec_16p2g1: api: POSIX+PIL4DFS # handled by ftest <<: *mdtest_ec_16p2g1 +mdtest_pil4dfs_rp_3g1: + api: POSIX+PIL4DFS # handled by ftest + <<: *mdtest_rp_3g1 + dfuse: disable_caching: true diff --git a/src/tests/ftest/performance/mdtest_hard.py b/src/tests/ftest/performance/mdtest_hard.py index 2eebc5738a8..a1bf2ec3076 100644 --- a/src/tests/ftest/performance/mdtest_hard.py +++ b/src/tests/ftest/performance/mdtest_hard.py @@ -33,6 +33,15 @@ def test_performance_mdtest_hard_dfs_ec_16p2g1(self): """ self.run_performance_mdtest(namespace="/run/mdtest_dfs_ec_16p2g1/*") + def test_performance_mdtest_hard_dfs_rp_3g1(self): + """Test Description: Run MdTest Hard, DFS, RP_3G1. + + :avocado: tags=all,manual + :avocado: tags=performance + :avocado: tags=MdtestHard,test_performance_mdtest_hard_dfs_rp_3g1 + """ + self.run_performance_mdtest(namespace="/run/mdtest_dfs_rp_3g1/*") + def test_performance_mdtest_hard_pil4dfs_s1(self): """Test Description: Run MDTest Hard, dfuse + pil4dfs, S1. @@ -51,3 +60,12 @@ def test_performance_mdtest_hard_pil4dfs_ec_16p2g1(self): :avocado: tags=MdtestHard,test_performance_mdtest_hard_pil4dfs_ec_16p2g1 """ self.run_performance_mdtest(namespace="/run/mdtest_pil4dfs_ec_16p2g1/*") + + def test_performance_mdtest_hard_pil4dfs_rp_3g1(self): + """Test Description: Run MDTest Hard, dfuse + pil4dfs, RP_3G1. 
+ + :avocado: tags=all,manual + :avocado: tags=performance + :avocado: tags=MdtestHard,test_performance_mdtest_hard_pil4dfs_rp_3g1 + """ + self.run_performance_mdtest(namespace="/run/mdtest_pil4dfs_rp_3g1/*") diff --git a/src/tests/ftest/performance/mdtest_hard.yaml b/src/tests/ftest/performance/mdtest_hard.yaml index ae3fcebaf5c..0599ea61319 100644 --- a/src/tests/ftest/performance/mdtest_hard.yaml +++ b/src/tests/ftest/performance/mdtest_hard.yaml @@ -28,8 +28,7 @@ server_config: pool: size: 95% - control_method: dmg - properties: ec_cell_sz:128KiB + properties: ec_cell_sz:1MiB container: type: POSIX @@ -47,7 +46,7 @@ mdtest: &mdtest_base write_bytes: 3901 num_of_files_dirs: 100000000 stonewall_timer: 30 - stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile" + stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false mdtest_s1: &mdtest_s1 @@ -62,6 +61,12 @@ mdtest_ec_16p2g1: &mdtest_ec_16p2g1 dfs_dir_oclass: RP_3GX dfs_chunk: 16MiB +mdtest_rp_3g1: &mdtest_rp_3g1 + <<: *mdtest_base + dfs_oclass: RP_3G1 + dfs_dir_oclass: RP_3GX + dfs_chunk: 1MiB + mdtest_dfs_s1: api: DFS <<: *mdtest_s1 @@ -70,6 +75,10 @@ mdtest_dfs_ec_16p2g1: api: DFS <<: *mdtest_ec_16p2g1 +mdtest_dfs_rp_3g1: + api: DFS + <<: *mdtest_rp_3g1 + mdtest_pil4dfs_s1: api: POSIX+PIL4DFS # handled by ftest <<: *mdtest_s1 @@ -78,6 +87,13 @@ mdtest_pil4dfs_ec_16p2g1: api: POSIX+PIL4DFS # handled by ftest <<: *mdtest_ec_16p2g1 +mdtest_pil4dfs_rp_3g1: + api: POSIX+PIL4DFS # handled by ftest + <<: *mdtest_rp_3g1 + +dfuse: + disable_caching: true + client: env_vars: - D_LOG_MASK=INFO diff --git a/src/tests/ftest/recovery/ddb.py b/src/tests/ftest/recovery/ddb.py index 04df3184984..25e7223e0fa 100644 --- a/src/tests/ftest/recovery/ddb.py +++ b/src/tests/ftest/recovery/ddb.py @@ -90,12 +90,12 @@ def copy_remote_to_local(remote_file_path, test_dir, remote): # Use clush --rcopy to copy the file from the remote server node to the local test # node. clush will append . to the file when copying. args = "--rcopy {} --dest {}".format(remote_file_path, test_dir) - clush_command = get_clush_command(hosts=remote, args=args) + clush_command = get_clush_command(hosts=remote, args=args, timeout=60) try: - run_command(command=clush_command) + run_command(command=clush_command, timeout=None) except DaosTestError as error: - print("ERROR: Copying {} from {}: {}".format(remote_file_path, remote, error)) - raise error + raise DaosTestError( + f"ERROR: Copying {remote_file_path} from {remote}: {error}") from error # Remove the appended . from the copied file. 
current_file_path = "".join([remote_file_path, ".", remote]) @@ -103,10 +103,8 @@ def copy_remote_to_local(remote_file_path, test_dir, remote): try: run_command(command=mv_command) except DaosTestError as error: - print( - "ERROR: Moving {} to {}: {}".format( - current_file_path, remote_file_path, error)) - raise error + raise DaosTestError( + f"ERROR: Moving {current_file_path} to {remote_file_path}: {error}") from error class DdbTest(RecoveryTestBase): diff --git a/src/tests/ftest/server/multiengine_persocket.py b/src/tests/ftest/server/multiengine_persocket.py index 8c92fdfbdad..0431a9b7b2a 100644 --- a/src/tests/ftest/server/multiengine_persocket.py +++ b/src/tests/ftest/server/multiengine_persocket.py @@ -63,15 +63,12 @@ def verify_list_attr(self, indata, attributes_list): self.log.info(" list_attr size: %s", size) if length != size: - self.fail( - "FAIL: Size does not match for Names in list attr, Expected " - "len={} and received len={}".format(length, size)) + self.fail(f"Container attribute list size mismatch: expected {length}, received {size}") + # verify the Attributes names in list_attr retrieve for key in indata.keys(): if key.decode() not in attributes_list: - self.fail( - "FAIL: Name does not match after list attr, Expected " - "buf={} and received buf={}".format(key, attributes_list)) + self.fail(f"Unexpected container attribute received: {key}") def verify_get_attr(self, indata, outdata): """verify the Attributes value after get_attr. @@ -92,37 +89,29 @@ def verify_get_attr(self, indata, outdata): self.log.info(" set_attr data: %s", decoded) for attr, value in indata.items(): - if value != decoded.get(attr.decode(), None): + received = decoded.get(attr.decode(), None) + if value != received: self.fail( - "FAIL: Value does not match after get({}), Expected " - "val={} and received val={}".format(attr, value, - decoded.get(attr.decode(), None))) - - def daos_server_scm_reset(self, step): - """Perform daos_server scm reset. + f"Unexpected value for container attribute {attr}: expected {value}, " + f"received {received}") - Args: - step (str): test step. - """ + def daos_server_scm_reset(self): + """Perform daos_server scm reset.""" cmd = DaosServerCommand() cmd.sudo = False cmd.debug.value = False cmd.set_sub_command("scm") cmd.sub_command_class.set_sub_command("reset") cmd.sub_command_class.sub_command_class.force.value = True - self.log.info( - "===(%s.A)Starting daos_server scm reset: %s", step, str(cmd)) + self.log_step("Resetting server PMem") results = run_remote(self.log, self.hostlist_servers, str(cmd), timeout=180) if not results.passed: - self.fail( - "#({0}.A){1} failed, " - "please make sure the server equipped with PMem modules".format(step, cmd)) + self.fail("Error resetting server PMem - ensure servers are equipped with PMem modules") - def daos_server_scm_prepare_ns(self, step, engines_per_socket=1): + def daos_server_scm_prepare_ns(self, engines_per_socket=1): """Perform daos_server scm prepare --scm-ns-per-socket. Args: - step (str): test step. engines_per_socket (int): number of engines per socket. 
""" cmd = DaosServerCommand() @@ -132,15 +121,10 @@ def daos_server_scm_prepare_ns(self, step, engines_per_socket=1): cmd.sub_command_class.set_sub_command("prepare") cmd.sub_command_class.sub_command_class.scm_ns_per_socket.value = engines_per_socket cmd.sub_command_class.sub_command_class.force.value = True - - self.log.info( - "===(%s.B)Starting daos_server scm prepare -S: %s", step, str(cmd)) + self.log_step(f"Preparing server PMem for {engines_per_socket} engines per socket") results = run_remote(self.log, self.hostlist_servers, str(cmd), timeout=180) if not results.passed: - self.fail( - "#({0}.B){1} failed, " - "please make sure the server equipped with {2} PMem " - "modules.".format(step, cmd, engines_per_socket)) + self.fail(f"Error preparing server PMem for {engines_per_socket} engines per socket") def host_reboot(self, hosts): """To reboot the hosts. @@ -154,7 +138,7 @@ def host_reboot(self, hosts): if not wait_for_result(self.log, check_ping, 600, 5, True, host=hosts[0], expected_ping=False, cmd_timeout=60, verbose=True): - self.fail("Shutwown not detected within 600 seconds.") + self.fail("Shutdown not detected within 600 seconds.") if not wait_for_result(self.log, check_ping, 600, 5, True, host=hosts[0], expected_ping=True, cmd_timeout=60, verbose=True): self.fail("Reboot not detected within 600 seconds.") @@ -184,20 +168,9 @@ def storage_format(self): if not run_local(self.log, "dmg storage format").passed: self.fail("dmg storage format failed") - def cleanup(self): - """Servers clean up after test complete.""" - self.pool.destroy(recursive=1, force=1) - cleanup_cmds = [ - "sudo systemctl stop daos_server.service", - "sudo umount /mnt/daos*", - "sudo wipefs -a /dev/pmem*", - "/usr/bin/ls -l /dev/pmem*", - 'lsblk|grep -E "NAME|pmem"'] - for cmd in cleanup_cmds: - run_remote(self.log, self.hostlist_servers, cmd, timeout=90) - - def test_multiengines_per_socket(self): + def test_multi_engines_per_socket(self): """Test ID: DAOS-12076. + Test description: Test multiple engines/sockets. 
(1) Scm reset and prepare --scm-ns-per-socket (2) Start server @@ -207,112 +180,75 @@ def test_multiengines_per_socket(self): (6) Container create and attributes test (7) IOR test (8) MDTEST - (9) Cleanup + To launch test: (1) Make sure server is equipped with PMem - (2) ./launch.py test_multiengines_per_socket -ts -tc + (2) ./launch.py test_multi_engines_per_socket -ts -tc + :avocado: tags=manual :avocado: tags=server - :avocado: tags=MultiEnginesPerSocketTest,test_multiengines_per_socket + :avocado: tags=MultiEnginesPerSocketTest,test_multi_engines_per_socket """ - # (1) Scm reset and prepare --scm-ns-per-socket - step = 1 - self.log.info("===(%s)===Scm reset and prepare --scm-ns-per-socket", step) - engines_per_socket = self.params.get( - "engines_per_socket", "/run/server_config/*", default=1) - num_pmem = self.params.get( - "number_pmem", "/run/server_config/*", default=1) - self.daos_server_scm_reset(step) + server_namespace = "/run/server_config/*" + num_attributes = self.params.get("num_attributes", '/run/container/*') + _engines_per_socket = self.params.get("engines_per_socket", server_namespace, 1) + _num_pmem = self.params.get("number_pmem", server_namespace, 1) + + # Configure PMem for multiple engines per socket + self.daos_server_scm_reset() self.host_reboot(self.hostlist_servers) - self.daos_server_scm_prepare_ns(1.1, engines_per_socket) + self.daos_server_scm_prepare_ns(_engines_per_socket) self.host_reboot(self.hostlist_servers) - self.daos_server_scm_prepare_ns(1.2, engines_per_socket) + self.daos_server_scm_prepare_ns(_engines_per_socket) if not wait_for_result(self.log, self.check_pmem, 160, 1, False, - hosts=self.hostlist_servers, count=num_pmem): - self.fail("#{} pmem devices not found on all hosts.".format(num_pmem)) - self.storage_format() - - # (2) Start server - step += 1 - self.log.info("===(%s)===Start server", step) - start_server_cmds = [ - 'lsblk|grep -E "NAME|pmem"', - "sudo cp /etc/daos/daos_server.yml_4 /etc/daos/daos_server.yml", - "sudo systemctl start daos_server.service"] - for cmd in start_server_cmds: - results = run_remote(self.log, self.hostlist_servers, cmd, timeout=90) - # Check for server start status - if not results.passed: - self.fail("#Fail on {0}".format(cmd)) - - # (3) Start agent - step += 1 - self.log.info("===(%s)===Start agent", step) - start_agent_cmds = [ - "sudo systemctl start daos_agent.service", - "dmg storage scan", - "dmg network scan", - "dmg storage format", - "dmg storage query usage", - "dmg storage query list-devices", - "dmg system query"] - for cmd in start_agent_cmds: - results = run_remote(self.log, self.hostlist_clients, cmd, timeout=90) - # Check for agent start status - if not results.passed and "sudo systemctl" in cmd: - self.fail("#Fail on {0}".format(cmd)) - # (4) Dmg system query - step += 1 - self.log.info("===(%s)===Dmg system query", step) - # Delay is needed for multi ranks to show - query_cmds = [ - "dmg system query", - "dmg system query -v"] - for cmd in query_cmds: - results = run_remote(self.log, self.hostlist_clients, cmd, timeout=90) - - # (5) Pool create - step += 1 - self.log.info("===(%s)===Pool create", step) + hosts=self.hostlist_servers, count=_num_pmem): + self.fail(f"Error {_num_pmem} PMem devices not found on all hosts.") + + # Start servers + self.log_step("Starting servers") + run_remote(self.log, self.hostlist_servers, 'lsblk|grep -E "NAME|pmem"') + self.start_servers() + + # Start agents + self.log_step("Starting agents") + self.start_agents() + + # Run some dmg commands + 
self.log_step("Query the storage usage") + dmg = self.get_dmg_command() + # dmg.storage_query_usage() + dmg.storage_query_list_devices() + + # Create a pool + self.log_step("Create a pool") self.add_pool(connect=False) # (6) Container create and attributes test - step += 1 - self.log.info("===(%s)===Container create and attributes test", step) + self.log_step("Create a container and verify the attributes") self.add_container(self.pool) self.container.open() - num_attributes = self.params.get("num_attributes", '/run/attrtests/*') attr_dict = self.create_data_set(num_attributes) try: self.container.container.set_attr(data=attr_dict) data = self.container.list_attrs(verbose=False) self.verify_list_attr(attr_dict, data['response']) - data = self.container.list_attrs(verbose=True) self.verify_get_attr(attr_dict, data['response']) - except DaosApiError as excep: - self.log.info(excep) + except DaosApiError as error: + self.log.info(error) self.log.info(traceback.format_exc()) - self.fail("#Test was expected to pass but it failed.\n") + self.fail("Error setting and verify container attributes") self.container.close() self.pool.disconnect() # (7) IOR test - step += 1 - self.log.info("===(%s)===IOR test", step) + self.log_step("Run ior") ior_timeout = self.params.get("ior_timeout", '/run/ior/*') self.run_ior_with_pool( timeout=ior_timeout, create_pool=True, create_cont=True, stop_dfuse=True) # (8) MDTEST - step += 1 - self.log.info("===(%s)===MDTEST", step) + self.log_step("Run mdtest") mdtest_params = self.params.get("mdtest_params", "/run/mdtest/*") self.run_mdtest_multiple_variants(mdtest_params) - - # (9) Cleanup - step += 1 - self.log.info("===(%s)===Cleanup", step) - cmd = "dmg system query -v" - results = run_remote(self.log, self.hostlist_clients, cmd, timeout=90) - self.cleanup() + self.log.info("Test passed") diff --git a/src/tests/ftest/server/multiengine_persocket.yaml b/src/tests/ftest/server/multiengine_persocket.yaml index d183b2cb0dc..628f05273cd 100644 --- a/src/tests/ftest/server/multiengine_persocket.yaml +++ b/src/tests/ftest/server/multiengine_persocket.yaml @@ -1,104 +1,103 @@ hosts: test_servers: 1 test_clients: 1 + timeout: 930 + setup: start_agents: False start_servers: False start_agents_once: False start_servers_once: False + server_config: name: daos_server + provider: ofi+tcp reboot_waittime: 210 + engines_per_host: 4 engines_per_socket: 2 number_pmem: 4 engines: - - + 0: pinned_numa_node: 0 targets: 8 nr_xs_helpers: 0 # count of I/O offload threads per engine fabric_iface: eth0 fabric_iface_port: 31416 log_mask: ERR - log_file: /tmp/daos_engine.1.log + log_file: daos_engine.0.log env_vars: - FI_SOCKETS_MAX_CONN_RETRY=1 - FI_SOCKETS_CONN_TIMEOUT=2000 storage: - - + 0: class: dcpm scm_list: [/dev/pmem0] - scm_mount: /mnt/daos1 - - + scm_mount: /mnt/daos0 + 1: pinned_numa_node: 0 targets: 8 nr_xs_helpers: 0 # count of I/O offload threads per engine fabric_iface: eth0 fabric_iface_port: 32416 log_mask: ERR - log_file: /tmp/daos_engine.2.log + log_file: daos_engine.1.log env_vars: - FI_SOCKETS_MAX_CONN_RETRY=1 - FI_SOCKETS_CONN_TIMEOUT=2000 storage: - - + 0: class: dcpm scm_list: [/dev/pmem0.1] - scm_mount: /mnt/daos2 - - + scm_mount: /mnt/daos1 + 2: pinned_numa_node: 1 targets: 8 nr_xs_helpers: 0 # count of I/O offload threads per engine fabric_iface: eth0 fabric_iface_port: 33416 log_mask: ERR - log_file: /tmp/daos_engine.3.log + log_file: daos_engine.2.log env_vars: - FI_SOCKETS_MAX_CONN_RETRY=1 - FI_SOCKETS_CONN_TIMEOUT=2000 storage: - - + 0: class: dcpm scm_list: 
[/dev/pmem1] - scm_mount: /mnt/daos3 - - + scm_mount: /mnt/daos2 + 3: pinned_numa_node: 1 targets: 8 nr_xs_helpers: 0 # count of I/O offload threads per engine fabric_iface: eth0 fabric_iface_port: 34416 log_mask: ERR - log_file: /tmp/daos_engine.4.log + log_file: daos_engine.3.log env_vars: - FI_SOCKETS_MAX_CONN_RETRY=1 - FI_SOCKETS_CONN_TIMEOUT=2000 storage: - - + 0: class: dcpm scm_list: [/dev/pmem1.1] - scm_mount: /mnt/daos4 - transport_config: - allow_insecure: false -agent_config: - transport_config: - allow_insecure: false -dmg: - transport_config: - allow_insecure: false -provider: ofi+tcp + scm_mount: /mnt/daos3 + pool: control_method: dmg scm_size: 1G name: daos_server + container: control_method: daos type: POSIX properties: rf:0 -attrtests: num_attributes: 20 + dfuse: mount_dir: "/tmp/daos_dfuse1/" disable_caching: True + ior: ior_timeout: 120 client_processes: @@ -111,6 +110,7 @@ ior: iorflags: write_flg: "-w -W -k -G 1 -i 1" read_flg: "-C -k -e -r -R -g -G 1 -Q 1 -vv" + mdtest: client_processes: ppn: 8 diff --git a/src/tests/ftest/slurm_setup.py b/src/tests/ftest/slurm_setup.py index 0c3d300d5ff..00e95c6e128 100755 --- a/src/tests/ftest/slurm_setup.py +++ b/src/tests/ftest/slurm_setup.py @@ -145,8 +145,9 @@ def start_munge(self, user): non_control = self.nodes.difference(self.control) self.log.debug('Copying the munge key to %s', non_control) command = get_clush_command( - non_control, args=f"-B -S -v --copy {self.MUNGE_KEY} --dest {self.MUNGE_KEY}") - result = run_remote(self.log, self.control, command) + non_control, args=f"-B -S -v --copy {self.MUNGE_KEY} --dest {self.MUNGE_KEY}", + timeout=60) + result = run_remote(self.log, self.control, command, timeout=None) if not result.passed: raise SlurmSetupException(f'Error creating munge key on {result.failed_hosts}') diff --git a/src/tests/ftest/soak/soak-extra-mdonssd.yaml b/src/tests/ftest/soak/soak-extra-mdonssd.yaml new file mode 100644 index 00000000000..895e24848fd --- /dev/null +++ b/src/tests/ftest/soak/soak-extra-mdonssd.yaml @@ -0,0 +1,13 @@ +skip_add_log_msg: True +soak_stress: +ior_stress: + transfer_size: + - '1m' + - '4k' +datamover_stress: + ior_write: + block_size: + - '1M' + ior_read: + block_size: + - '1M' diff --git a/src/tests/ftest/tags.py b/src/tests/ftest/tags.py index 4158f53141f..ee35d91f295 100755 --- a/src/tests/ftest/tags.py +++ b/src/tests/ftest/tags.py @@ -17,6 +17,10 @@ THIS_FILE = os.path.realpath(__file__) FTEST_DIR = os.path.dirname(THIS_FILE) +MANUAL_TAG = ('manual',) +STAGE_TYPE_TAGS = ('vm', 'hw', 'hw_vmd') +STAGE_SIZE_TAGS = ('medium', 'large') +STAGE_FREQUENCY_TAGS = ('all', 'pr', 'daily_regression', 'full_regression') class LintFailure(Exception): @@ -254,7 +258,7 @@ def sorted_tags(tags): """ tags_tmp = set(tags) new_tags = [] - for tag in ('all', 'vm', 'hw', 'medium', 'large', 'pr', 'daily_regression', 'full_regression'): + for tag in STAGE_TYPE_TAGS + STAGE_SIZE_TAGS + STAGE_FREQUENCY_TAGS: if tag in tags_tmp: new_tags.append(tag) tags_tmp.remove(tag) @@ -279,11 +283,11 @@ def run_linter(paths=None, verbose=False): test_wo_tags = [] tests_wo_class_as_tag = [] tests_wo_method_as_tag = [] + test_w_invalid_test_tag = [] tests_wo_hw_vm_manual = [] tests_w_empty_tag = [] tests_wo_a_feature_tag = [] - non_feature_tags = set([ - 'all', 'vm', 'hw', 'medium', 'large', 'pr', 'daily_regression', 'full_regression']) + non_feature_tags = set(STAGE_TYPE_TAGS + STAGE_SIZE_TAGS + STAGE_FREQUENCY_TAGS) ftest_tag_map = FtestTagMap(paths) for file_path, classes in iter(ftest_tag_map): 
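The stage tags are now defined once as module-level tuples and shared by `sorted_tags` and the linter. A small self-contained sketch of the ordering those constants drive; the constants are copied from the hunk above, while this `sorted_tags` is a simplified stand-in that assumes leftover tags are appended alphabetically:

```python
STAGE_TYPE_TAGS = ('vm', 'hw', 'hw_vmd')
STAGE_SIZE_TAGS = ('medium', 'large')
STAGE_FREQUENCY_TAGS = ('all', 'pr', 'daily_regression', 'full_regression')


def sorted_tags(tags):
    """Order stage tags first (type, size, frequency), then the rest alphabetically."""
    tags_tmp = set(tags)
    new_tags = []
    for tag in STAGE_TYPE_TAGS + STAGE_SIZE_TAGS + STAGE_FREQUENCY_TAGS:
        if tag in tags_tmp:
            new_tags.append(tag)
            tags_tmp.remove(tag)
    new_tags.extend(sorted(tags_tmp))
    return new_tags


print(sorted_tags({'vmd', 'all', 'hw_vmd', 'medium'}))
# ['hw_vmd', 'medium', 'all', 'vmd']
```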
all_files.append(file_path) @@ -297,7 +301,11 @@ def run_linter(paths=None, verbose=False): tests_wo_class_as_tag.append(method_name) if method_name not in tags: tests_wo_method_as_tag.append(method_name) - if not set(tags).intersection(set(['vm', 'hw', 'manual'])): + for _tag in tags: + if _tag.startswith('test_') and _tag != method_name: + test_w_invalid_test_tag.append(method_name) + break + if not set(tags).intersection(set(MANUAL_TAG + STAGE_TYPE_TAGS)): tests_wo_hw_vm_manual.append(method_name) if '' in tags: tests_w_empty_tag.append(method_name) @@ -338,6 +346,7 @@ def _error_handler(_list, message, required=True): _error_handler(test_wo_tags, 'tests without tags'), _error_handler(tests_wo_class_as_tag, 'tests without class as tag'), _error_handler(tests_wo_method_as_tag, 'tests without method name as tag'), + _error_handler(test_w_invalid_test_tag, 'tests with invalid test_ tag'), _error_handler(tests_wo_hw_vm_manual, 'tests without HW, VM, or manual tag'), _error_handler(tests_w_empty_tag, 'tests with an empty tag'), _error_handler(tests_wo_a_feature_tag, 'tests without a feature tag')])) diff --git a/src/tests/ftest/telemetry/wal_metrics.py b/src/tests/ftest/telemetry/wal_metrics.py index 105015aaf29..ee3f85ff2d7 100644 --- a/src/tests/ftest/telemetry/wal_metrics.py +++ b/src/tests/ftest/telemetry/wal_metrics.py @@ -200,8 +200,8 @@ def test_wal_checkpoint_metrics(self): # Check point dirty chunks should be 1-300 ranges[metric][label] = [1, 300] elif '_dirty_pages' in metric: - # Check point dirty pages should be 1-3 - ranges[metric][label] = [1, 3] + # Check point dirty pages should be 1-30 + ranges[metric][label] = [1, 30] elif '_duration' in metric: # Check point duration should be 1-2,000,000 ranges[metric][label] = [1, 2000000] diff --git a/src/tests/ftest/util/data_mover_test_base.py b/src/tests/ftest/util/data_mover_test_base.py index 669a720228c..db272febc94 100644 --- a/src/tests/ftest/util/data_mover_test_base.py +++ b/src/tests/ftest/util/data_mover_test_base.py @@ -113,7 +113,6 @@ def __init__(self, *args, **kwargs): self.ddeserialize_cmd = None self.fs_copy_cmd = None self.cont_clone_cmd = None - self.pool = [] self.dfuse_hosts = None self.num_run_datamover = 0 # Number of times run_datamover was called @@ -141,12 +140,6 @@ def __init__(self, *args, **kwargs): self.preserve_props_path = None - # List of local test paths to create and remove - self.posix_local_test_paths = [] - - # List of daos test paths to keep track of - self.daos_test_paths = [] - def setUp(self): """Set up each test case.""" # Start the servers and agents @@ -222,15 +215,10 @@ def new_posix_test_path(self, shared=False, create=True, parent=None, mount_dir_ str: the posix path. """ - # make directory name unique to datamover test - method = self.get_test_name() - dir_name = "{}{}".format(method, len(self.posix_local_test_paths)) + # make directory name unique to this test + dir_name = self.label_generator.get_label(self.get_test_name()) path = join(parent or self.posix_root.value, dir_name) - # Add to the list of posix paths - if not shared: - self.posix_local_test_paths.append(path) - if create: # Create the directory cmd = f"mkdir -p '{path}'" @@ -271,18 +259,16 @@ def new_daos_test_path(self, create=True, cont=None, parent="/"): str: the path relative to the root of the container. 
""" - dir_name = "daos_test{}".format(len(self.daos_test_paths)) + dir_name = self.label_generator.get_label('daos_test_dir') path = join(parent, dir_name) - # Add to the list of daos paths - self.daos_test_paths.append(path) - if create: if not cont or not cont.path: self.fail("Container path required to create directory.") # Create the directory relative to the container path - cmd = "mkdir -p '{}'".format(cont.path.value + path) - self.execute_cmd(cmd) + full_path = cont.path.value + path + if not run_remote(self.log, self.hostlist_clients, f"mkdir -p '{full_path}'").passed: + self.fail(f"Failed to mkdir {full_path}") return path @@ -306,20 +292,6 @@ def _validate_param_type(self, param_type): self.fail("Invalid param_type: {}".format(_type)) return None - def create_pool(self, **params): - """Create a TestPool object and adds to self.pool. - - Returns: - TestPool: the created pool - - """ - pool = self.get_pool(connect=False, **params) - - # Save the pool - self.pool.append(pool) - - return pool - def parse_create_cont_label(self, output): """Parse a uuid or label from create container output. @@ -830,7 +802,8 @@ def run_diff(self, src, dst, deref=False): cmd = "diff -r {} '{}' '{}'".format( deref_str, src, dst) - self.execute_cmd(cmd) + if not run_remote(self.log, self.hostlist_clients, cmd, timeout=300).passed: + self.fail(f"Unexpected diff between {src} and {dst}") # pylint: disable=too-many-arguments def run_datamover(self, test_desc=None, diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index 34e21f66d0a..effc3172bac 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -252,16 +252,18 @@ def storage_format(self, force=False, timeout=30, verbose=False): self.timeout = saved_timeout return self.result - def storage_set_faulty(self, uuid, force=True): + def storage_set_faulty(self, host, uuid, force=True): """Get the result of the 'dmg storage set nvme-faulty' command. Args: + host (str): Identifier of host on which action should be performed. uuid (str): Device UUID to query. force (bool, optional): Force setting device state to FAULTY. Defaults to True. """ return self._get_json_result( - ("storage", "set", "nvme-faulty"), uuid=uuid, force=force) + ("storage", "set", "nvme-faulty"), host=host, uuid=uuid, + force=force) def storage_query_list_devices(self, rank=None, health=False, uuid=None): """Get the result of the 'dmg storage query list-devices' command. @@ -338,13 +340,13 @@ def storage_led_check(self, ids=None): return self._get_json_result( ("storage", "led", "check"), ids=ids) - def storage_replace_nvme(self, old_uuid, new_uuid, no_reint=False): + def storage_replace_nvme(self, host, old_uuid, new_uuid): """Get the result of the 'dmg storage replace nvme' command. Args: + host (str): Identifier of host on which action should be performed. old_uuid (str): Old NVME Device ID. new_uuid (str): New NVME Device ID replacing the old device. - no_reint (bool, optional): Don't perform reintegration. Defaults to False. Returns: dict: JSON formatted dmg command result. @@ -354,8 +356,8 @@ def storage_replace_nvme(self, old_uuid, new_uuid, no_reint=False): """ return self._get_json_result( - ("storage", "replace", "nvme"), old_uuid=old_uuid, - new_uuid=new_uuid, no_reint=no_reint) + ("storage", "replace", "nvme"), host=host, old_uuid=old_uuid, + new_uuid=new_uuid) def storage_scan_nvme_health(self): """Get the result of the 'dmg storage scan --nvme-health' command. 
diff --git a/src/tests/ftest/util/dmg_utils_base.py b/src/tests/ftest/util/dmg_utils_base.py index a601b590033..39109320af1 100644 --- a/src/tests/ftest/util/dmg_utils_base.py +++ b/src/tests/ftest/util/dmg_utils_base.py @@ -686,7 +686,7 @@ def __init__(self): super().__init__("/run/dmg/storage/replace/nvme/*", "nvme") self.old_uuid = FormattedParameter("--old-uuid {}", None) self.new_uuid = FormattedParameter("--new-uuid {}", None) - self.no_reint = FormattedParameter("--no-reint", False) + self.host = FormattedParameter("--host {}", None) class LedSubCommand(CommandWithSubCommand): """Defines an object for the dmg storage LED command""" @@ -810,6 +810,7 @@ def __init__(self): super().__init__("/run/dmg/storage/query/device-state/*", "nvme-faulty") self.uuid = FormattedParameter("-u {}", None) self.force = FormattedParameter("--force", False) + self.host = FormattedParameter("--host {}", None) class SystemSubCommand(CommandWithSubCommand): """Defines an object for the dmg system sub command.""" @@ -955,7 +956,7 @@ class ListSubCommand(CommandWithParameters): def __init__(self): """Create a dmg telemetry metrics list object.""" super().__init__("/run/dmg/telemetry/metrics/list/*", "list") - self.host = FormattedParameter("--host-list={}", None) + self.host = FormattedParameter("--host={}", None) self.port = FormattedParameter("--port={}", None) class QuerySubCommand(CommandWithParameters): @@ -964,7 +965,7 @@ class QuerySubCommand(CommandWithParameters): def __init__(self): """Create a dmg telemetry metrics query object.""" super().__init__("/run/dmg/telemetry/metrics/query/*", "query") - self.host = FormattedParameter("--host-list={}", None) + self.host = FormattedParameter("--host={}", None) self.port = FormattedParameter("--port={}", None) self.metrics = FormattedParameter("--metrics={}", None) diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py index be21183c97a..12c66d76b8c 100644 --- a/src/tests/ftest/util/file_count_test_base.py +++ b/src/tests/ftest/util/file_count_test_base.py @@ -97,7 +97,7 @@ def run_file_count(self): self.processes = mdtest_np self.ppn = mdtest_ppn if self.mdtest_cmd.api.value == 'POSIX': - self.mdtest_cmd.env.update(LD_PRELOAD=intercept, D_IL_REPORT='1') + self.mdtest_cmd.env.update(LD_PRELOAD=intercept) self.execute_mdtest() else: self.execute_mdtest() diff --git a/src/tests/ftest/util/ior_test_base.py b/src/tests/ftest/util/ior_test_base.py index 625a283593e..7a7955d78a5 100644 --- a/src/tests/ftest/util/ior_test_base.py +++ b/src/tests/ftest/util/ior_test_base.py @@ -6,10 +6,9 @@ import os from apricot import TestWithServers -from ClusterShell.NodeSet import NodeSet from dfuse_utils import get_dfuse, start_dfuse from exception_utils import CommandFailure -from general_utils import get_random_string, pcmd +from general_utils import get_random_string from host_utils import get_local_host from ior_utils import IorCommand from job_manager_utils import get_job_manager @@ -225,8 +224,6 @@ def run_ior(self, manager, processes, intercept=None, display_space=True, env = self.ior_cmd.get_default_env(str(manager), self.client_log) if intercept: env['LD_PRELOAD'] = intercept - if 'D_IL_REPORT' not in env: - env['D_IL_REPORT'] = '1' if plugin_path: env["HDF5_VOL_CONNECTOR"] = "daos" env["HDF5_PLUGIN_PATH"] = str(plugin_path) @@ -371,59 +368,3 @@ def verify_pool_size(self, original_pool_info, processes): self.fail( "Pool Free Size did not match: actual={}, expected={}".format( actual_pool_size, expected_pool_size)) - - 
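In the replace-nvme sub-command the boolean `--no-reint` flag gives way to a `--host` value parameter. A minimal sketch of how such formatted parameters expand into command-line text; this class is a simplified stand-in for the framework's `FormattedParameter`, written under the assumption that unset parameters are omitted and boolean parameters emit their flag verbatim:

```python
class FormattedParameter:
    """Render `str_format` with the assigned value; emit nothing when unset."""

    def __init__(self, str_format, default=None):
        self.str_format = str_format
        self.value = default

    def __str__(self):
        if self.value is None or self.value is False:
            return ""                  # unset or disabled: omit the option
        if self.value is True:
            return self.str_format     # boolean flag: emit as-is
        return self.str_format.format(self.value)


host = FormattedParameter("--host {}", None)
print(repr(str(host)))   # '' (omitted while unset)
host.value = "boro-11"
print(str(host))         # --host boro-11
```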
def execute_cmd(self, command, fail_on_err=True, display_output=True): - """Execute cmd using general_utils.pcmd. - - Args: - command (str): the command to execute on the client hosts - fail_on_err (bool, optional): whether or not to fail the test if - command returns a non zero return code. Defaults to True. - display_output (bool, optional): whether or not to display output. - Defaults to True. - - Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. - - """ - try: - # Execute the bash command on each client host - result = self._execute_command(command, fail_on_err, display_output) - - except CommandFailure as error: - # Report an error if any command fails - self.log.error("Failed to execute command: %s", str(error)) - self.fail("Failed to execute command") - - return result - - def _execute_command(self, command, fail_on_err=True, display_output=True, hosts=None): - """Execute the command on all client hosts. - - Optionally verify if the command returns a non zero return code. - - Args: - command (str): the command to execute on the client hosts - fail_on_err (bool, optional): whether or not to fail the test if - command returns a non zero return code. Defaults to True. - display_output (bool, optional): whether or not to display output. - Defaults to True. - - Raises: - CommandFailure: if 'fail_on_err' is set and the command fails on at - least one of the client hosts - - Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. - - """ - if hosts is None: - hosts = self.hostlist_clients - result = pcmd(hosts, command, verbose=display_output, timeout=300) - if (0 not in result or len(result) > 1) and fail_on_err: - hosts = [str(nodes) for code, nodes in list(result.items()) if code != 0] - raise CommandFailure("Error running '{}' on the following hosts: {}".format( - command, NodeSet(",".join(hosts)))) - return result diff --git a/src/tests/ftest/util/ior_utils.py b/src/tests/ftest/util/ior_utils.py index 7851e4587d7..6edf4eaf696 100644 --- a/src/tests/ftest/util/ior_utils.py +++ b/src/tests/ftest/util/ior_utils.py @@ -461,21 +461,6 @@ def get_ior_metrics(cmdresult): return (write_metrics, read_metrics) - @staticmethod - def log_metrics(logger, message, metrics): - """Log the ior metrics. - - Args: - logger (log): logger object handle - message (str) : Message to print before logging metrics - metric (lst) : IOR write and read metrics - """ - logger.info("\n") - logger.info(message) - for metric in metrics: - logger.info(metric) - logger.info("\n") - class IorMetrics(IntEnum): """Index Name and Number of each column in IOR result summary.""" @@ -588,7 +573,7 @@ def get_unique_log(self, container): return '.'.join(['_'.join(parts), 'log']) def run(self, pool, container, processes, ppn=None, intercept=None, plugin_path=None, - dfuse=None, display_space=True, fail_on_warning=False, unique_log=True, il_report=1): + dfuse=None, display_space=True, fail_on_warning=False, unique_log=True, il_report=None): # pylint: disable=too-many-arguments """Run ior. @@ -609,7 +594,7 @@ def run(self, pool, container, processes, ppn=None, intercept=None, plugin_path= unique_log (bool, optional): whether or not to update the log file with a new unique log file name. Defaults to True. il_report (int, optional): D_IL_REPORT value to use when 'intercept' is specified and a - value does not already exist in the environment. Defaults to 1. 
+ value does not already exist in the environment. Defaults to None. Raises: CommandFailure: if there is an error running the ior command @@ -627,7 +612,7 @@ def run(self, pool, container, processes, ppn=None, intercept=None, plugin_path= self.env["LD_PRELOAD"] = intercept if "D_LOG_MASK" not in self.env: self.env["D_LOG_MASK"] = "INFO" - if "D_IL_REPORT" not in self.env: + if "D_IL_REPORT" not in self.env and il_report is not None: self.env["D_IL_REPORT"] = str(il_report) if plugin_path: diff --git a/src/tests/ftest/util/nvme_utils.py b/src/tests/ftest/util/nvme_utils.py index 4f4c9dd8eea..2394e22e10e 100644 --- a/src/tests/ftest/util/nvme_utils.py +++ b/src/tests/ftest/util/nvme_utils.py @@ -55,7 +55,7 @@ def set_device_faulty(test, dmg, server, uuid, pool=None, has_sys_xs=False, **kw dict: the json response from the dmg storage set-faulty command. """ - dmg.hostlist = server + kwargs['host'] = server kwargs['uuid'] = uuid try: response = get_dmg_response(dmg.storage_set_faulty, **kwargs) diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index 2f9d33b07c5..f4893558fb0 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -345,7 +345,8 @@ def log_result_data(log, data): log.debug("%s%s", " " * indent, line) -def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False): +def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False, + timeout=None, fanout=None): """Get the clush command with optional sudo arguments. Args: @@ -355,14 +356,21 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su command_env (EnvironmentVariables, optional): environment variables to export with the command. Defaults to None. sudo (bool, optional): whether to run the command with sudo privileges. Defaults to False. + timeout (int, optional): number of seconds to wait for the command to complete. + Defaults to None. + fanout (int, optional): fanout to use. 
Default uses the max of the + clush default (64) or available cores Returns: str: the clush command """ - cmd_list = ["clush"] + if fanout is None: + fanout = max(64, len(os.sched_getaffinity(0))) + cmd_list = ["clush", "-f", str(fanout), "-w", str(hosts)] + if timeout is not None: + cmd_list.extend(["-u", str(timeout)]) if args: cmd_list.append(args) - cmd_list.extend(["-w", str(hosts)]) # If ever needed, this is how to disable host key checking: # cmd_list.extend(["-o", "-oStrictHostKeyChecking=no"]) cmd_list.append(command_as_user(command, "root" if command_sudo else "", command_env)) diff --git a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py index 39178e0e0d9..9e523c6096c 100644 --- a/src/tests/ftest/util/soak_utils.py +++ b/src/tests/ftest/util/soak_utils.py @@ -997,10 +997,8 @@ def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob, oclass_list=None, mpirun_cmd.get_params(self) if api == "POSIX-LIBPIL4DFS": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libpil4dfs.so') - env["D_IL_REPORT"] = "1" if api == "POSIX-LIBIOIL": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libioil.so') - env["D_IL_REPORT"] = "1" # add envs if api is HDF5-VOL if api == "HDF5-VOL": vol = True @@ -1166,10 +1164,8 @@ def create_mdtest_cmdline(self, job_spec, pool, ppn, nodesperjob): if self.enable_il and api == "POSIX-LIBPIL4DFS": env["LD_PRELOAD"] = os.path.join( self.prefix, 'lib64', 'libpil4dfs.so') - env["D_IL_REPORT"] = "1" if self.enable_il and api == "POSIX-LIBIOIL": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libioil.so') - env["D_IL_REPORT"] = "1" mpirun_cmd = Mpirun(mdtest_cmd, mpi_type=self.mpi_module) mpirun_cmd.get_params(self) mpirun_cmd.assign_processes(nodesperjob * ppn) @@ -1297,10 +1293,8 @@ def create_fio_cmdline(self, job_spec, pool): cmds.append(f"cd {dfuse.mount_dir.value};") if self.enable_il and api == "POSIX-LIBPIL4DFS": cmds.append(f"export LD_PRELOAD={os.path.join(self.prefix, 'lib64', 'libpil4dfs.so')}") - cmds.append("export D_IL_REPORT=1") if self.enable_il and api == "POSIX-LIBIOIL": cmds.append(f"export LD_PRELOAD={os.path.join(self.prefix, 'lib64', 'libioil.so')}") - cmds.append("export D_IL_REPORT=1") cmds.append(str(fio_cmd)) cmds.append("status=$?") cmds.append("cd -") @@ -1372,10 +1366,8 @@ def create_app_cmdline(self, job_spec, pool, ppn, nodesperjob): env["DAOS_UNS_PREFIX"] = format_path(pool, self.container[-1]) if self.enable_il and api == "POSIX-LIBPIL4DFS": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libpil4dfs.so') - env["D_IL_REPORT"] = "1" if self.enable_il and api == "POSIX-LIBIOIL": env["LD_PRELOAD"] = os.path.join(self.prefix, 'lib64', 'libioil.so') - env["D_IL_REPORT"] = "1" mpirun_cmd.assign_environment(env, True) mpirun_cmd.assign_processes(nodesperjob * ppn) mpirun_cmd.ppn.update(ppn) diff --git a/src/tests/ftest/vmd/fault_reintegration.py b/src/tests/ftest/vmd/fault_reintegration.py index b7220d42943..f74a8bda549 100644 --- a/src/tests/ftest/vmd/fault_reintegration.py +++ b/src/tests/ftest/vmd/fault_reintegration.py @@ -91,8 +91,8 @@ def test_nvme_fault_reintegration(self): 9. Replace the same drive back. 10. Drive status LED should be off indicating good device is plugged-in. 
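With the unconditional `export D_IL_REPORT=1` lines gone, the soak jobs only preload the interception library and leave report verbosity to the caller. A hedged sketch of the resulting environment setup; the function name and the idea of seeding from `os.environ` are illustrative, while the library paths follow the hunks above:

```python
import os


def interception_env(prefix, api, base_env=None):
    """Build a job environment that preloads the chosen interception library.

    Only LD_PRELOAD is set; D_IL_REPORT is deliberately left untouched so the
    caller (or the test YAML) decides whether interception reports are logged.
    """
    env = dict(base_env if base_env is not None else os.environ)
    if api == "POSIX-LIBPIL4DFS":
        env["LD_PRELOAD"] = os.path.join(prefix, "lib64", "libpil4dfs.so")
    elif api == "POSIX-LIBIOIL":
        env["LD_PRELOAD"] = os.path.join(prefix, "lib64", "libioil.so")
    return env


# interception_env("/usr", "POSIX-LIBIOIL")["LD_PRELOAD"]
# -> /usr/lib64/libioil.so
```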
- :avocado: tags=all,manual - :avocado: tags=hw,medium + :avocado: tags=all,full_regression + :avocado: tags=hw_vmd,medium :avocado: tags=vmd,vmd_led :avocado: tags=NvmeFaultReintegrate,test_nvme_fault_reintegration @@ -168,7 +168,8 @@ def test_nvme_fault_reintegration(self): self.log_step( "Marking the {} device as faulty and verifying it is 'EVICTED' and its " "LED is 'ON'".format(test_dev)) - set_device_faulty(self, self.dmg, self.dmg.hostlist, test_dev, self.pool) + set_device_faulty(self, self.dmg, self.hostlist_servers[0], test_dev, + self.pool) # check device state after set nvme-faulty if not self.verify_dev_led_state(test_dev, "EVICTED", "ON"): @@ -215,7 +216,8 @@ def test_nvme_fault_reintegration(self): # 9. self.log_step("Replace the same drive back.") get_dmg_response( - self.dmg.storage_replace_nvme, old_uuid=test_dev, new_uuid=test_dev) + self.dmg.storage_replace_nvme, host=self.hostlist_servers[0], + old_uuid=test_dev, new_uuid=test_dev) # Wait for rebuild to start self.pool.wait_for_rebuild_to_start() # Wait for rebuild to complete diff --git a/src/tests/ftest/vmd/led.py b/src/tests/ftest/vmd/led.py index d6cb33330d9..eb49e8289c0 100644 --- a/src/tests/ftest/vmd/led.py +++ b/src/tests/ftest/vmd/led.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -76,8 +76,8 @@ def get_led_status_value(self, device_id=None): def test_vmd_led_status(self): """Jira ID: DAOS-11290 - :avocado: tags=all,manual - :avocado: tags=hw,medium + :avocado: tags=all,full_regression + :avocado: tags=hw_vmd,medium :avocado: tags=vmd,vmd_led :avocado: tags=VmdLedStatus,test_vmd_led_status """ @@ -97,8 +97,8 @@ def test_vmd_led_status(self): def test_vmd_led_faulty(self): """Jira ID: DAOS-11290 - :avocado: tags=all,manual - :avocado: tags=hw,medium + :avocado: tags=all,full_regression + :avocado: tags=hw_vmd,medium :avocado: tags=vmd,vmd_led :avocado: tags=VmdLedStatus,test_vmd_led_faulty """ @@ -126,11 +126,13 @@ def test_disk_failure_recover(self): uuid_list = sorted(uuid_dict.keys()) self.log.info("Devices on hosts %s: %s", hosts, uuid_list) self.log.info("First device on hosts %s: %s", hosts, uuid_list[0]) - resp = set_device_faulty(self, self.dmg, hosts.split(':')[0], uuid_list[0]) + host = hosts.split(':')[0] + resp = set_device_faulty(self, self.dmg, host, uuid_list[0]) self.log.info("Sleeping for 15 seconds ...") time.sleep(15) self.log.info(resp) - resp = self.dmg.storage_replace_nvme(old_uuid=uuid_list[0], new_uuid=uuid_list[0]) + resp = self.dmg.storage_replace_nvme( + host=host, old_uuid=uuid_list[0], new_uuid=uuid_list[0]) self.log.info("Sleeping for 60 seconds ...") time.sleep(60) self.log.info(resp) diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index c45bcbfff01..3bd44d646f3 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -15,7 +15,7 @@ Name: daos Version: 2.6.1 -Release: 2%{?relval}%{?dist} +Release: 3%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -594,6 +594,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Tue Oct 01 2024 Phillip Henderson 2.6.1-3 +- Third release candidate for 2.6.1 + * Fri Sep 20 2024 Phillip Henderson 2.6.1-2 - Second release candidate for 2.6.1
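For reference, the `get_clush_command` change earlier in this patch makes the fanout explicit, defaulting to whichever is larger, clush's built-in default of 64 or the number of usable cores, and maps the optional timeout to clush's `-u` command timeout. A standalone sketch of that assembly; it simplifies away the `command_as_user` wrapping, and `hosts` is assumed to be anything with a sensible `str()`:

```python
import os


def build_clush_command(hosts, args=None, command="", timeout=None, fanout=None):
    """Assemble a clush command line with an explicit fanout and optional timeout."""
    if fanout is None:
        # Keep clush's default fanout of 64 unless more cores are available.
        fanout = max(64, len(os.sched_getaffinity(0)))
    cmd_list = ["clush", "-f", str(fanout), "-w", str(hosts)]
    if timeout is not None:
        cmd_list.extend(["-u", str(timeout)])   # per-command timeout in seconds
    if args:
        cmd_list.append(args)
    cmd_list.append(command)
    return " ".join(cmd_list)


print(build_clush_command("node[1-4]", command="uname -r", timeout=60))
# e.g. clush -f 64 -w node[1-4] -u 60 uname -r (on a host with 64 or fewer cores)
```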