Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-8331 client: add client side metrics #13517

Closed
wants to merge 30 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f2f73e3
DAOS-8331 client: add client side metrics
Dec 8, 2023
32973fc
Merge branch 'master' into wangdi/dc_tm
Jan 1, 2024
5cb77b8
Merge branch 'master' into wangdi/dc_tm
Jan 2, 2024
13b44a2
DAOS-8331 pool: add per pool metrics
Jan 2, 2024
f773fc9
Merge branch 'master' into wangdi/dc_tm
Jan 3, 2024
5dfad80
DAOS-8331 object: fix metrics format
Jan 3, 2024
f33d9f6
DAOS-8331 pool: fix building failure
Jan 3, 2024
402eeb0
DAOS-8331 client: destroy metrics if not retain
Jan 5, 2024
b938c3e
DAOS-8331 gurt: fix memory corruption
Jan 5, 2024
4d068b7
Merge branch 'master' into wangdi/dc_tm
Jan 5, 2024
0b45f82
DAOS-8331 pool: Make per pool metrices independent
Jan 6, 2024
f191218
Merge branch 'master' into wangdi/dc_tm
Jan 6, 2024
7910300
DAOS-8331 pool: missing metrics enable checking
Jan 6, 2024
c33153e
Merge branch 'master' into wangdi/dc_tm
Jan 6, 2024
8182b70
DAOS-8331 object: do not get time if metrics are not enabled.
Jan 8, 2024
5572d24
DAOS-8331 client: Update patch to fix environment
Jan 9, 2024
737676e
Merge branch 'master' into wangdi/dc_tm
Jan 9, 2024
c363d7b
DAOS-8331 client: fix environment VAR name
Jan 9, 2024
12e0ec7
Merge branch 'master' into wangdi/dc_tm
Jan 22, 2024
df4c187
Merge branch 'master' into wangdi/dc_tm
Feb 5, 2024
fa5eea6
DAOS-8331 objects: fix building failure.
Feb 5, 2024
a66b4fc
Merge branch 'master' into wangdi/dc_tm
Feb 6, 2024
0dc0696
DAOS-8331 client: Fix segfault
Feb 6, 2024
3a1c930
DAOS-8331: gurt: add ephemeral lock for ephemeral directory
Feb 7, 2024
e309fda
DAOS-8331 gurt: fix input check
Feb 8, 2024
d5a8748
DAOS-8331 gurt: miss lock in ephemeral delete
Feb 8, 2024
c87d38d
DAOS-8331 gurt: fix typo
Feb 8, 2024
8189aec
Merge branch 'master' into wangdi/dc_tm
Feb 20, 2024
fc4dbda
Merge branch 'master' into wangdi/dc_tm
Feb 20, 2024
11c411f
DAOS-8331 client: resolve Kris comments
Feb 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/cart/README.env
Original file line number Diff line number Diff line change
Expand Up @@ -174,3 +174,15 @@ This file lists the environment variables used in CaRT.
. CRT_TEST_CONT
When set to 1, orterun does not automatically shut down other servers when
one server is shutdown. Used in cart internal testing.

. D_CLIENT_METRICS_ENABLE
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these be added to the list in src/cart/crt_init.c as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, sure.

When set to 1, client side metrics will be collected on each daos client, which
can be retrieved by daos_metrics -j job_id on each client.

. D_CLIENT_METRICS_RETAIN
When set to 1, client side metrics will be retained even after the job exits, i.e.
those metrics can be retrieved by daos_metrics even after the job exits.

. D_CLIENT_METRICS_DUMP_PATH
Set the path of the file that each client dumps its client side metrics to, so
these metrics will be dumped to the specified file when the job exits.
7 changes: 6 additions & 1 deletion src/cart/crt_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,12 @@ static const char *crt_env_names[] = {"D_PROVIDER",
"D_QUOTA_RPCS",
"D_POST_INIT",
"D_POST_INCR",
"DAOS_SIGNAL_REGISTER"};
"DAOS_SIGNAL_REGISTER",
"D_CLIENT_METRICS_ENABLE",
"D_CLIENT_METRICS_RETAIN",
"D_CLIENT_METRICS_DUMP_PATH",

};

static void
crt_lib_init(void) __attribute__((__constructor__));
Expand Down
2 changes: 1 addition & 1 deletion src/client/api/SConscript
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Build DAOS client"""

LIBDAOS_SRC = ['agent.c', 'array.c', 'container.c', 'event.c', 'init.c', 'job.c', 'kv.c', 'mgmt.c',
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c']
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c', 'metrics.c']


def scons():
Expand Down
15 changes: 12 additions & 3 deletions src/client/api/init.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -23,6 +23,7 @@
#include <daos/btree_class.h>
#include <daos/placement.h>
#include <daos/job.h>
#include <daos/metrics.h>
#if BUILD_PIPELINE
#include <daos/pipeline.h>
#endif
Expand Down Expand Up @@ -242,19 +243,25 @@ daos_init(void)
if (rc != 0)
D_GOTO(out_co, rc);

rc = dc_tm_init();
if (rc)
D_GOTO(out_obj, rc);

#if BUILD_PIPELINE
/** set up pipeline */
rc = dc_pipeline_init();
if (rc != 0)
D_GOTO(out_obj, rc);
D_GOTO(out_tm, rc);
#endif
module_initialized++;
D_GOTO(unlock, rc = 0);

#if BUILD_PIPELINE
out_tm:
dc_tm_fini();
#endif
out_obj:
dc_obj_fini();
#endif
out_co:
dc_cont_fini();
out_pool:
Expand Down Expand Up @@ -309,6 +316,7 @@ daos_fini(void)
D_GOTO(unlock, rc);
}

daos_metrics_fini();
#if BUILD_PIPELINE
dc_pipeline_fini();
#endif
Expand All @@ -322,6 +330,7 @@ daos_fini(void)
D_ERROR("failed to disconnect some resources may leak, "
DF_RC"\n", DP_RC(rc));

dc_tm_fini();
dc_agent_fini();
dc_job_fini();

Expand Down
157 changes: 157 additions & 0 deletions src/client/api/metrics.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/*
* (C) Copyright 2020-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/utsname.h>
#include <daos/common.h>
#include <daos/job.h>
#include <daos/tls.h>
#include <daos/metrics.h>
#include <gurt/telemetry_common.h>
#include <gurt/telemetry_consumer.h>
#include <gurt/telemetry_producer.h>

/* Count passed to MAX_IDS_SIZE() when sizing the telemetry regions below. */
#define INIT_JOB_NUM 1024

/* Set from the DAOS_CLIENT_METRICS_ENABLE environment variable in dc_tm_init(). */
bool daos_client_metric;
/* Set from the DAOS_CLIENT_METRICS_RETAIN environment variable in dc_tm_init(). */
bool daos_client_metric_retain;

/*
 * Size needed for \a num entries of D_TM_METRIC_SIZE each. The argument is
 * parenthesized so that expressions such as MAX_IDS_SIZE(a + b) multiply the
 * whole sum rather than only the last operand.
 */
#define MAX_IDS_SIZE(num) ((num) * D_TM_METRIC_SIZE)
/* The client side metrics structure looks like
* root/job_id/pid/....
*/
int
dc_tm_init(void)
{
int metrics_tag;
pid_t pid;
int rc;

d_getenv_bool(DAOS_CLIENT_METRICS_ENABLE, &daos_client_metric);
if (!daos_client_metric)
return 0;

rc = dc_tls_key_create();
if (rc)
D_GOTO(out, rc);

metrics_tag = D_TM_OPEN_OR_CREATE | D_TM_MULTIPLE_WRITER_LOCK;
d_getenv_bool(DAOS_CLIENT_METRICS_RETAIN, &daos_client_metric_retain);
if (daos_client_metric_retain)
metrics_tag |= D_TM_RETAIN_SHMEM;
else
metrics_tag |= D_TM_RETAIN_SHMEM_IF_NON_EMPTY;

rc = d_tm_init(DC_TM_JOB_ROOT_ID, MAX_IDS_SIZE(INIT_JOB_NUM), metrics_tag);
if (rc != 0) {
DL_ERROR(rc, "init job root id.");
return rc;
}

D_INFO("INIT %s metrics\n", dc_jobid);
rc = d_tm_add_ephemeral_dir(NULL, MAX_IDS_SIZE(INIT_JOB_NUM), "%s", dc_jobid);
if (rc != 0 && rc != -DER_EXIST) {
DL_ERROR(rc, "add metric %s failed", dc_jobid);
D_GOTO(out, rc);
}

pid = getpid();
D_INFO("INIT %s/%u metrics\n", dc_jobid, pid);
rc = d_tm_add_ephemeral_dir(NULL, MAX_IDS_SIZE(INIT_JOB_NUM), "%s/%u",
mjmac marked this conversation as resolved.
Show resolved Hide resolved
dc_jobid, pid);
if (rc != 0) {
DL_ERROR(rc, "add metric %s/%u failed.\n", dc_jobid, pid);

Check warning on line 67 in src/client/api/metrics.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
D_GOTO(out, rc);
}

out:
if (rc)
d_tm_fini();

return rc;
}

/**
 * d_tm_iterate() callback used by dump_tm_file(): prints one node through
 * d_tm_print_node().
 *
 * \param[in]	arg	Output stream, passed by the caller as a FILE pointer.
 *
 * All other parameters are forwarded to d_tm_print_node() unchanged.
 */
static void
iter_dump(struct d_tm_context *ctx, struct d_tm_node_t *node, int level,
	  char *path, int format, int opt_fields, void *arg)
{
	FILE *stream = arg;

	d_tm_print_node(ctx, node, level, path, format, opt_fields, stream);
}

/**
 * Dump the metrics found under "<jobid>/<pid>" of the calling process to a
 * file in CSV format.
 *
 * \param[in]	dump_path	File to write; opened with mode "w+".
 *
 * \return	0 on success,
 *		-DER_INVAL if the file cannot be opened or closed, or if the
 *		metrics path does not fit into D_TM_MAX_NAME_LEN,
 *		-DER_NOMEM if d_tm_open() returns NULL,
 *		-DER_NONEXIST if no metrics exist for this job/pid.
 */
static int
dump_tm_file(const char *dump_path)
{
	struct d_tm_context	*ctx;
	struct d_tm_node_t	*root;
	char			 dirname[D_TM_MAX_NAME_LEN] = {0};
	uint32_t		 filter;
	FILE			*dump_file;
	int			 len;
	int			 rc = 0;

	dump_file = fopen(dump_path, "w+");
	if (dump_file == NULL) {
		D_INFO("cannot open %s\n", dump_path);
		return -DER_INVAL;
	}

	/* A truncated name would silently look up the wrong directory. */
	len = snprintf(dirname, sizeof(dirname), "%s/%d", dc_jobid, (int)getpid());
	if (len < 0 || (size_t)len >= sizeof(dirname)) {
		D_INFO("metrics path of job %s is too long\n", dc_jobid);
		D_GOTO(close, rc = -DER_INVAL);
	}

	filter = D_TM_COUNTER | D_TM_DURATION | D_TM_TIMESTAMP | D_TM_MEMINFO |
		 D_TM_TIMER_SNAPSHOT | D_TM_GAUGE | D_TM_STATS_GAUGE;

	ctx = d_tm_open(DC_TM_JOB_ROOT_ID);
	if (ctx == NULL)
		D_GOTO(close, rc = -DER_NOMEM);

	root = d_tm_find_metric(ctx, dirname);
	if (root == NULL) {
		/* Use the log, not stdout, which belongs to the application. */
		D_INFO("No metrics found at: '%s'\n", dirname);
		D_GOTO(close_ctx, rc = -DER_NONEXIST);
	}

	d_tm_print_field_descriptors(0, dump_file);

	d_tm_iterate(ctx, root, 0, filter, NULL, D_TM_CSV, 0, iter_dump, dump_file);

close_ctx:
	d_tm_close(&ctx);
close:
	/* fclose() flushes buffered output, so a failure means a short dump. */
	if (fclose(dump_file) != 0) {
		D_INFO("cannot close %s\n", dump_path);
		if (rc == 0)
			rc = -DER_INVAL;
	}
	return rc;
}

/**
 * Tear down client side telemetry for the calling process.
 *
 * Does nothing when client metrics are disabled. Otherwise:
 *  - if DAOS_CLIENT_METRICS_DUMP_PATH is set, dumps the metrics to that file;
 *  - finalizes the client TLS and deletes its key;
 *  - unless retention was requested, deletes the "<jobid>/<pid>" ephemeral
 *    directory and then tries to delete the "<jobid>" directory;
 *  - calls d_tm_fini().
 *
 * Failures of the individual steps are logged and do not stop the teardown.
 */
void
dc_tm_fini(void)
{
	pid_t	 pid = getpid();
	char	*dump_path;
	int	 rc;

	if (!daos_client_metric)
		return;

	dump_path = getenv(DAOS_CLIENT_METRICS_DUMP_PATH);
	if (dump_path != NULL) {
		D_INFO("dump path is %s\n", dump_path);
		/* Best effort: report a failed dump but keep tearing down. */
		rc = dump_tm_file(dump_path);
		if (rc != 0)
			DL_ERROR(rc, "dump metrics to %s failed", dump_path);
	}

	dc_tls_fini();
	dc_tls_key_delete();

	if (!daos_client_metric_retain) {
		rc = d_tm_del_ephemeral_dir("%s/%d", dc_jobid, (int)pid);
		if (rc != 0)
			DL_ERROR(rc, "delete tm directory %s/%d.", dc_jobid, (int)pid);

		/* This step targets the job level directory, so name that one. */
		rc = d_tm_try_del_ephemeral_dir("%s", dc_jobid);
		if (rc != 0)
			DL_ERROR(rc, "delete tm directory %s.", dc_jobid);
	}

	D_INFO("delete pid %s/%d\n", dc_jobid, (int)pid);
	d_tm_fini();
}
4 changes: 2 additions & 2 deletions src/common/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
'acl_api.c', 'acl_util.c', 'acl_principal.c', 'cont_props.c',
'dedup.c', 'profile.c', 'compression.c', 'compression_isal.c',
'compression_qat.c', 'multihash.c', 'multihash_isal.c',
'cipher.c', 'cipher_isal.c', 'qat.c', 'fault_domain.c']

'cipher.c', 'cipher_isal.c', 'qat.c', 'fault_domain.c',
'tls.c', 'metrics.c']

def build_daos_common(denv, client):

Check failure on line 14 in src/common/SConscript

View workflow job for this annotation

GitHub Actions / Flake8 check

E302 expected 2 blank lines, found 1
""" Building non-pmem version for client's common lib"""
benv = denv.Clone()
stack_mmap_files = []
Expand Down
Loading
Loading