Skip to content

Commit

Permalink
DAOS-8331 client: add client side metrics
Browse files Browse the repository at this point in the history
1. Move TLS to common, so both client and server can have TLS,
which metrics can be attached to.

2. Add object metrics on the client side, enabled by
export DAOS_CLIENT_METRICS=1. And client metrics are organized
as "root/jobid/pid/xxxxx"

And root/jobid/pid are stored in an independent shared memory segment,
which will only be destroyed once all jobs are destroyed.

During each daos thread initialization, it will create another
shmem (pid/xxx), which all metrics of the thread will be attached
to. This shmem will be destroyed once the thread exits, though
if DAOS_CLIENT_METRICS_RETAIN is set, these client metrics will be
retained, and they can be retrieved by
	daos_metrics --jobid

3. Add DAOS_METRIC_DUMP_ENV to dump metrics from the current thread
once it exits.

4. Some fixes in telemetry about conv_ptr when re-opening the
shared memory.

5. Add a daos_metrics --jobid XXX option to retrieve all metrics
of the job.

Required-githooks: true
Signed-off-by: Di Wang <di.wang@intel.com>
  • Loading branch information
Di Wang committed Dec 18, 2023
1 parent 40dd690 commit d195131
Show file tree
Hide file tree
Showing 26 changed files with 1,282 additions and 459 deletions.
2 changes: 1 addition & 1 deletion src/client/api/SConscript
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Build DAOS client"""

LIBDAOS_SRC = ['agent.c', 'array.c', 'container.c', 'event.c', 'init.c', 'job.c', 'kv.c', 'mgmt.c',
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c']
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c', 'metrics.c']


def scons():
Expand Down
12 changes: 10 additions & 2 deletions src/client/api/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <daos/btree_class.h>
#include <daos/placement.h>
#include <daos/job.h>
#include <daos/metric.h>
#if BUILD_PIPELINE
#include <daos/pipeline.h>
#endif
Expand Down Expand Up @@ -242,19 +243,25 @@ daos_init(void)
if (rc != 0)
D_GOTO(out_co, rc);

rc = dc_tm_init();
if (rc)
D_GOTO(out_obj, rc);

#if BUILD_PIPELINE
/** set up pipeline */
rc = dc_pipeline_init();
if (rc != 0)
D_GOTO(out_obj, rc);
D_GOTO(out_tm, rc);
#endif
module_initialized++;
D_GOTO(unlock, rc = 0);

#if BUILD_PIPELINE
out_tm:
dc_tm_fini();
#endif
out_obj:
dc_obj_fini();
#endif
out_co:
dc_cont_fini();
out_pool:
Expand Down Expand Up @@ -322,6 +329,7 @@ daos_fini(void)
D_ERROR("failed to disconnect some resources may leak, "
DF_RC"\n", DP_RC(rc));

dc_tm_fini();
dc_agent_fini();
dc_job_fini();

Expand Down
167 changes: 167 additions & 0 deletions src/client/api/metrics.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/*
* (C) Copyright 2020-2023 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/utsname.h>
#include <daos/common.h>
#include <daos/job.h>
#include <daos/tls.h>
#include <gurt/telemetry_common.h>
#include <gurt/telemetry_consumer.h>
#include <gurt/telemetry_producer.h>

/* Metric count handed to MAX_IDS_SIZE() when sizing each d_tm_init() call below. */
#define INIT_JOB_NUM 1024
/* Set by dc_tm_init() from DAOS_CLIENT_METRICS_ENV; gates all client telemetry work. */
bool client_metric;
/* Set by dc_tm_init() from DAOS_CLIENT_METRICS_RETAIN_ENV; selects D_TM_RETAIN_SHMEM. */
bool client_metric_retain;

/*
 * Size argument for d_tm_init(): \a num times D_TM_METRIC_SIZE.
 * The argument is parenthesized so that expressions such as
 * MAX_IDS_SIZE(a + b) multiply the whole sum rather than only its last term.
 */
#define MAX_IDS_SIZE(num) ((num) * D_TM_METRIC_SIZE)
/* The client side metrics structure looks like
* root/job_id/pid/....
*/
int
dc_tm_init(void)
{
struct d_tm_node_t *job_node;
struct d_tm_context *current_ctx;
struct daos_thread_local_storage *dtls;
int metrics_tag;
pid_t pid;
int rc;

d_getenv_bool(DAOS_CLIENT_METRICS_ENV, &client_metric);
if (!client_metric)
return 0;

d_getenv_bool(DAOS_CLIENT_METRICS_RETAIN_ENV, &client_metric_retain);

metrics_tag = D_TM_CLIENT_PROCESS | D_TM_OPEN_OR_CREATE;
if (client_metric_retain)
metrics_tag |= D_TM_RETAIN_SHMEM;
else
metrics_tag |= D_TM_RETAIN_SHMEM_IF_NON_EMPTY;

rc = d_tm_init(DC_TM_JOB_ROOT_ID, MAX_IDS_SIZE(INIT_JOB_NUM), metrics_tag);
if (rc != 0) {
D_ERROR("init job root id %u: %d\n", DC_TM_JOB_ROOT_ID, rc);
return rc;
}

pid = getpid();
D_INFO("INIT %s/%u metrics\n", dc_jobid, pid);
rc = d_tm_add_metric(&job_node, D_TM_DIRECTORY,
"job id directory", "dir",
"%s/%u", dc_jobid, pid);
/* Close job root sheme */

Check failure on line 59 in src/client/api/metrics.c

View workflow job for this annotation

GitHub Actions / Codespell

sheme ==> scheme, shame
d_tm_fini();
if (rc != 0) {
D_ERROR("add metric %s/%u failed: %d\n", dc_jobid, pid, rc);
D_GOTO(out, rc);
}

metrics_tag = D_TM_CLIENT_PROCESS;
if (client_metric_retain)
metrics_tag |= D_TM_RETAIN_SHMEM;
rc = d_tm_init(pid, MAX_IDS_SIZE(INIT_JOB_NUM), metrics_tag);
if (rc != 0)
D_GOTO(out, rc);

current_ctx = d_tm_open(pid);
if (current_ctx == NULL)
D_GOTO(out, rc = -DER_NOMEM);

dtls = dc_tls_init(DAOS_CLI_TAG, pid);
if (dtls == NULL)
D_GOTO(out, rc = -DER_NOMEM);
out:
if (rc)
d_tm_fini();

return rc;
}

/*
 * Per-node callback handed to d_tm_iterate() by dump_tm_file().
 * The opaque \a arg carries the FILE stream the node is printed to; all
 * other arguments are forwarded to d_tm_print_node() unchanged.
 */
static void
iter_dump(struct d_tm_context *ctx, struct d_tm_node_t *node, int level,
	  char *path, int format, int opt_fields, void *arg)
{
	FILE *out_stream = arg;

	d_tm_print_node(ctx, node, level, path, format, opt_fields, out_stream);
}

/**
 * Write the metrics of the calling process to \a dump_path in CSV format.
 *
 * The file is created or truncated, the telemetry instance identified by
 * getpid() is opened, and every node matching the filter below is printed
 * through iter_dump().
 *
 * \param[in] dump_path	Path of the file to write.
 *
 * \return	0 on success, -DER_INVAL if the file cannot be opened,
 *		-DER_NOMEM if the telemetry instance cannot be opened,
 *		-DER_NONEXIST if it has no root node.
 */
static int
dump_tm_file(const char *dump_path)
{
	struct d_tm_context	*ctx;
	struct d_tm_node_t	*root;
	uint32_t		 filter;
	FILE			*dump_file;
	pid_t			 pid;
	int			 rc = 0;

	dump_file = fopen(dump_path, "w+");
	if (dump_file == NULL) {
		D_INFO("cannot open %s\n", dump_path);
		return -DER_INVAL;
	}

	filter = D_TM_COUNTER | D_TM_DURATION | D_TM_TIMESTAMP | D_TM_MEMINFO |
		 D_TM_TIMER_SNAPSHOT | D_TM_GAUGE | D_TM_STATS_GAUGE;

	pid = getpid();
	ctx = d_tm_open(pid);
	if (ctx == NULL)
		D_GOTO(close, rc = -DER_NOMEM);

	root = d_tm_get_root(ctx);
	if (root == NULL) {
		D_INFO("no root exist for %u\n", pid);
		D_GOTO(close_ctx, rc = -DER_NONEXIST);
	}

	d_tm_print_field_descriptors(0, dump_file);

	d_tm_iterate(ctx, root, 0, filter, NULL, D_TM_CSV, 0, iter_dump, dump_file);

close_ctx:
	d_tm_close(&ctx);
close:
	/* fclose() flushes buffered output, so a failure here means lost data. */
	if (fclose(dump_file) != 0)
		D_INFO("cannot close %s, dump may be incomplete\n", dump_path);
	return rc;
}

/**
 * Tear down client-side telemetry set up by dc_tm_init().
 *
 * Does nothing when client metrics are disabled. Otherwise:
 *  1. if METRIC_DUMP_ENV is set, the metrics are dumped to that path first;
 *  2. the client TLS and the per-pid telemetry instance are finalized;
 *  3. unless DAOS_CLIENT_METRICS_RETAIN_ENV was set, the job root is reopened
 *     and the "<jobid>/<pid>" and "<jobid>" nodes are deleted.
 *
 * Cleanup is best effort: failures are logged and the function returns.
 */
void
dc_tm_fini(void)
{
	pid_t	 pid = getpid();
	char	*dump_path;
	int	 rc;

	if (!client_metric)
		return;

	dump_path = getenv(METRIC_DUMP_ENV);
	if (dump_path != NULL) {
		rc = dump_tm_file(dump_path);
		if (rc != 0)
			D_ERROR("dump metrics to %s failed: %d\n", dump_path, rc);
	}

	dc_tls_fini();
	/* close current pid ctx */
	d_tm_fini();

	if (client_metric_retain)
		return;

	rc = d_tm_init(DC_TM_JOB_ROOT_ID, MAX_IDS_SIZE(INIT_JOB_NUM),
		       D_TM_CLIENT_PROCESS | D_TM_RETAIN_SHMEM_IF_NON_EMPTY |
		       D_TM_OPEN_OR_CREATE);
	if (rc != 0) {
		D_ERROR("reopen job root id %u failed: %d\n", DC_TM_JOB_ROOT_ID, rc);
		return;
	}

	D_INFO("delete pid %s/%u\n", dc_jobid, pid);
	/* Same "%s/%u" path format that dc_tm_init() used to create the node. */
	d_tm_del_node("%s/%u", dc_jobid, pid);
	/*
	 * NOTE(review): presumably this only removes the job directory when no
	 * other pid is still registered under it -- confirm in d_tm_del_node().
	 */
	d_tm_del_node("%s", dc_jobid);

	d_tm_fini();
}
2 changes: 1 addition & 1 deletion src/common/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ COMMON_FILES = ['debug.c', 'mem.c', 'fail_loc.c', 'lru.c',
'dedup.c', 'profile.c', 'compression.c', 'compression_isal.c',
'compression_qat.c', 'multihash.c', 'multihash_isal.c',
'cipher.c', 'cipher_isal.c', 'qat.c', 'fault_domain.c',
'policy.c']
'policy.c', 'tls.c']


def build_daos_common(denv, client):
Expand Down
Loading

0 comments on commit d195131

Please sign in to comment.