Skip to content

Commit

Permalink
DAOS-8331 client: add client side metrics
Browse files Browse the repository at this point in the history
1. Move TLS to common, so both client and server can have TLS,
which metrics can be attached to.

2. Add object metrics on the client side, enabled by
export DAOS_CLIENT_METRICS=1. Client metrics are organized
as "root/jobid/pid/xxxxx".

The root/jobid/pid entries are stored in an independent shared memory
segment, which will only be destroyed once all jobs are destroyed.

During each daos thread initialization, another shmem segment
(pid/xxx) is created, to which all metrics of the thread are attached.
These metrics are destroyed once the thread exits; however,
if DAOS_CLIENT_METRICS_RETAIN is set, the client metrics are
retained and can be retrieved by
	daos_metrics --jobid

3. Add DAOS_METRIC_DUMP_ENV to dump the metrics of the current thread
once it exits.

4. Some fixes in telemetry for conv_ptr when re-opening the
shared memory.

5. Add the daos_metrics --jobid XXX option to retrieve all metrics
of the job.

Required-githooks: true
Signed-off-by: Di Wang <di.wang@intel.com>
  • Loading branch information
Di Wang committed Dec 21, 2023
1 parent 70cf75c commit 1e874d0
Show file tree
Hide file tree
Showing 25 changed files with 1,143 additions and 470 deletions.
2 changes: 1 addition & 1 deletion src/client/api/SConscript
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Build DAOS client"""

LIBDAOS_SRC = ['agent.c', 'array.c', 'container.c', 'event.c', 'init.c', 'job.c', 'kv.c', 'mgmt.c',
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c']
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c', 'metrics.c']


def scons():
Expand Down
12 changes: 10 additions & 2 deletions src/client/api/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <daos/btree_class.h>
#include <daos/placement.h>
#include <daos/job.h>
#include <daos/metric.h>
#if BUILD_PIPELINE
#include <daos/pipeline.h>
#endif
Expand Down Expand Up @@ -242,19 +243,25 @@ daos_init(void)
if (rc != 0)
D_GOTO(out_co, rc);

rc = dc_tm_init();
if (rc)
D_GOTO(out_obj, rc);

#if BUILD_PIPELINE
/** set up pipeline */
rc = dc_pipeline_init();
if (rc != 0)
D_GOTO(out_obj, rc);
D_GOTO(out_tm, rc);
#endif
module_initialized++;
D_GOTO(unlock, rc = 0);

#if BUILD_PIPELINE
out_tm:
dc_tm_fini();
#endif
out_obj:
dc_obj_fini();
#endif
out_co:
dc_cont_fini();
out_pool:
Expand Down Expand Up @@ -322,6 +329,7 @@ daos_fini(void)
D_ERROR("failed to disconnect some resources may leak, "
DF_RC"\n", DP_RC(rc));

dc_tm_fini();
dc_agent_fini();
dc_job_fini();

Expand Down
146 changes: 146 additions & 0 deletions src/client/api/metrics.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
/*
* (C) Copyright 2020-2023 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/utsname.h>
#include <daos/common.h>
#include <daos/job.h>
#include <daos/tls.h>
#include <gurt/telemetry_common.h>
#include <gurt/telemetry_consumer.h>
#include <gurt/telemetry_producer.h>

/** Number of entries used to size the client telemetry shared-memory regions. */
#define INIT_JOB_NUM 1024

/** Set from DAOS_CLIENT_METRICS_ENV in dc_tm_init(); true when client metrics are enabled. */
bool daos_client_metric;
/** Set from DAOS_CLIENT_METRICS_RETAIN_ENV in dc_tm_init(); true to keep metrics after exit. */
bool daos_client_metric_retain;

/**
 * Size in bytes of a region holding \a num metrics.
 * The argument is parenthesized so that expressions such as (a + b) scale correctly.
 */
#define MAX_IDS_SIZE(num) ((num) * D_TM_METRIC_SIZE)
/* The client side metrics structure looks like
* root/job_id/pid/....
*/
int
dc_tm_init(void)
{
int metrics_tag;
pid_t pid;
int rc;

d_getenv_bool(DAOS_CLIENT_METRICS_ENV, &daos_client_metric);
if (!daos_client_metric)
return 0;

rc = dc_tls_key_create();
if (rc)
D_GOTO(out, rc);

metrics_tag = D_TM_OPEN_OR_CREATE;
d_getenv_bool(DAOS_CLIENT_METRICS_RETAIN_ENV, &daos_client_metric_retain);
if (daos_client_metric_retain)
metrics_tag |= D_TM_RETAIN_SHMEM;
else
metrics_tag |= D_TM_RETAIN_SHMEM_IF_NON_EMPTY;

rc = d_tm_init(DC_TM_JOB_ROOT_ID, MAX_IDS_SIZE(INIT_JOB_NUM), metrics_tag);
if (rc != 0) {
DL_ERROR(rc, "init job root id.");
return rc;
}

pid = getpid();
D_INFO("INIT %s/%u metrics\n", dc_jobid, pid);

/** create new shmem space for per-pool metrics */
rc = d_tm_add_ephemeral_dir(NULL, MAX_IDS_SIZE(INIT_JOB_NUM), "%s/%u",
dc_jobid, pid);
if (rc != 0) {
DL_ERROR(rc, "add metric %s/%u failed.\n", dc_jobid, pid);

Check warning on line 61 in src/client/api/metrics.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
D_GOTO(out, rc);
}

out:
if (rc)
d_tm_fini();

return rc;
}

/**
 * Iteration callback handed to d_tm_iterate() by dump_tm_file().
 *
 * Prints a single telemetry node; \a arg carries the output stream
 * (a FILE pointer) the node is written to.
 */
static void
iter_dump(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, char *path, int format,
	  int opt_fields, void *arg)
{
	FILE *stream = arg;

	d_tm_print_node(ctx, node, level, path, format, opt_fields, stream);
}

/**
 * Dump the metrics found under "<jobid>/<pid>" of the current process to a file.
 *
 * The file is opened with mode "w+", the field descriptors are printed first,
 * then every node matching the filter is printed in CSV format.
 *
 * \param[in] dump_path	Path of the file to write; must not be NULL.
 *
 * \return 0 on success, -DER_INVAL if the file cannot be opened, -DER_NOMEM if
 *         the telemetry region cannot be opened, -DER_NONEXIST if no metrics
 *         exist for this job/pid.
 */
static int
dump_tm_file(const char *dump_path)
{
	struct d_tm_context	*ctx;
	struct d_tm_node_t	*root;
	char			 dirname[D_TM_MAX_NAME_LEN] = {0};
	uint32_t		 filter;
	FILE			*dump_file;
	/* must start at 0: the success path falls through the labels without setting it */
	int			 rc = 0;

	dump_file = fopen(dump_path, "w+");
	if (dump_file == NULL) {
		D_INFO("cannot open %s\n", dump_path);
		return -DER_INVAL;
	}

	filter = D_TM_COUNTER | D_TM_DURATION | D_TM_TIMESTAMP | D_TM_MEMINFO |
		 D_TM_TIMER_SNAPSHOT | D_TM_GAUGE | D_TM_STATS_GAUGE;

	ctx = d_tm_open(DC_TM_JOB_ROOT_ID);
	if (ctx == NULL)
		D_GOTO(close, rc = -DER_NOMEM);

	snprintf(dirname, sizeof(dirname), "%s/%u", dc_jobid, getpid());
	root = d_tm_find_metric(ctx, dirname);
	if (root == NULL) {
		printf("No metrics found at: '%s'\n", dirname);
		D_GOTO(close_ctx, rc = -DER_NONEXIST);
	}

	d_tm_print_field_descriptors(0, dump_file);

	d_tm_iterate(ctx, root, 0, filter, NULL, D_TM_CSV, 0, iter_dump, dump_file);

close_ctx:
	d_tm_close(&ctx);
close:
	fclose(dump_file);
	return rc;
}

/**
 * Finalize client side telemetry; counterpart of dc_tm_init().
 *
 * Does nothing when client metrics are disabled. Otherwise:
 *  - if METRIC_DUMP_ENV is set, dumps the metrics of this process to that path
 *    (a dump failure is logged but does not stop the teardown);
 *  - finalizes TLS and deletes the TLS key;
 *  - unless metrics are retained, deletes the "<jobid>/<pid>" ephemeral
 *    directory;
 *  - finalizes the telemetry library.
 */
void
dc_tm_fini(void)
{
	pid_t	pid = getpid();
	char	*dump_path;
	int	rc;

	if (!daos_client_metric)
		return;

	dump_path = getenv(METRIC_DUMP_ENV);
	if (dump_path != NULL) {
		/* only log the path once it is known to be non-NULL */
		D_INFO("dump path is %s\n", dump_path);
		rc = dump_tm_file(dump_path);
		if (rc != 0)
			DL_ERROR(rc, "dump metrics to %s failed.", dump_path);
	}

	dc_tls_fini();
	dc_tls_key_delete();

	if (!daos_client_metric_retain) {
		rc = d_tm_del_ephemeral_dir("%s/%d", dc_jobid, pid);
		if (rc != 0)
			DL_ERROR(rc, "delete tm directory %s/%d.", dc_jobid, pid);
	}

	D_INFO("delete pid %s/%u\n", dc_jobid, pid);
	d_tm_fini();
}
2 changes: 1 addition & 1 deletion src/common/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ COMMON_FILES = ['debug.c', 'mem.c', 'fail_loc.c', 'lru.c',
'dedup.c', 'profile.c', 'compression.c', 'compression_isal.c',
'compression_qat.c', 'multihash.c', 'multihash_isal.c',
'cipher.c', 'cipher_isal.c', 'qat.c', 'fault_domain.c',
'policy.c']
'policy.c', 'tls.c']


def build_daos_common(denv, client):
Expand Down
Loading

0 comments on commit 1e874d0

Please sign in to comment.