Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-14561 vos: Add garbage collection metrics (#13244) #13369

Merged
merged 5 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions site_scons/site_tools/go_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def _check_go_version(context):

# go version go1.2.3 Linux/amd64
go_version = out.split(' ')[2].replace('go', '')
if '-' in go_version:
go_version = go_version.split('-')[0]
if len([x for x, y in
zip(go_version.split('.'), MIN_GO_VERSION.split('.'))
if int(x) < int(y)]) > 0:
Expand Down
2 changes: 1 addition & 1 deletion src/vos/tests/vts_gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ gc_key_test(void **state)
int rc;

rc = gc_key_run(args);
assert_int_equal(rc, 0);
assert_rc_equal(rc, 0);
}

static int
Expand Down
18 changes: 16 additions & 2 deletions src/vos/vos_aggregate.c
Original file line number Diff line number Diff line change
Expand Up @@ -2358,7 +2358,12 @@ vos_aggregate_pre_cb(daos_handle_t ih, vos_iter_entry_t *entry,
}

if (rc < 0) {
struct vos_agg_metrics *vam = agg_cont2metrics(cont);

D_ERROR("VOS aggregation failed: "DF_RC"\n", DP_RC(rc));
if (vam && vam->vam_fail_count)
d_tm_inc_counter(vam->vam_fail_count, 1);

return rc;
}

Expand Down Expand Up @@ -2431,7 +2436,11 @@ vos_aggregate_post_cb(daos_handle_t ih, vos_iter_entry_t *entry,
inc_agg_counter(agg_param, type, AGG_OP_DEL);
rc = 0;
} else if (rc != 0) {
struct vos_agg_metrics *vam = agg_cont2metrics(cont);

D_ERROR("VOS aggregation failed: %d\n", rc);
if (vam && vam->vam_fail_count)
d_tm_inc_counter(vam->vam_fail_count, 1);

/*
* -DER_TX_BUSY error indicates current ilog aggregation
Expand All @@ -2442,8 +2451,6 @@ vos_aggregate_post_cb(daos_handle_t ih, vos_iter_entry_t *entry,
* orphan the current entry due to incarnation log semantics.
*/
if (rc == -DER_TX_BUSY) {
struct vos_agg_metrics *vam = agg_cont2metrics(cont);

agg_param->ap_in_progress = 1;
rc = 0;
switch (type) {
Expand Down Expand Up @@ -2736,6 +2743,13 @@ vos_aggregate(daos_handle_t coh, daos_epoch_range_t *epr,
free_agg_data:
D_FREE(ad);

if (rc < 0) {
struct vos_agg_metrics *vam = agg_cont2metrics(cont);

if (vam && vam->vam_fail_count)
d_tm_inc_counter(vam->vam_fail_count, 1);
}

return rc;
}

Expand Down
21 changes: 21 additions & 0 deletions src/vos/vos_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,12 @@ vos_metrics_alloc(const char *path, int tgt_id)
if (rc)
D_WARN("Failed to create 'merged_size' telemetry : "DF_RC"\n", DP_RC(rc));

/* VOS aggregation failed */
rc = d_tm_add_metric(&vam->vam_fail_count, D_TM_COUNTER, "aggregation failures", NULL,
"%s/%s/fail_count/tgt_%u", path, VOS_AGG_DIR, tgt_id);
if (rc)
D_WARN("Failed to create 'fail_count' telemetry : "DF_RC"\n", DP_RC(rc));

/* Metrics related to VOS checkpointing */
vos_chkpt_metrics_init(&vp_metrics->vp_chkpt_metrics, path, tgt_id);

Expand All @@ -768,6 +774,21 @@ vos_metrics_alloc(const char *path, int tgt_id)
if (rc)
D_WARN("Failed to create 'nvme_used' telemetry : "DF_RC"\n", DP_RC(rc));

/* VOS space SCM total metric */
rc = d_tm_add_metric(&vsm->vsm_scm_total, D_TM_GAUGE, "SCM space total", "bytes",
"%s/%s/scm_total/tgt_%u", path, VOS_SPACE_DIR, tgt_id);
if (rc)
D_WARN("Failed to create 'scm_total' telemetry : " DF_RC "\n", DP_RC(rc));

/* VOS space NVME total metric */
rc = d_tm_add_metric(&vsm->vsm_nvme_total, D_TM_GAUGE, "NVME space total", "bytes",
"%s/%s/nvme_total/tgt_%u", path, VOS_SPACE_DIR, tgt_id);
if (rc)
D_WARN("Failed to create 'nvme_total' telemetry : " DF_RC "\n", DP_RC(rc));

/** garbage collection metrics */
vos_gc_metrics_init(&vp_metrics->vp_gc_metrics, path, tgt_id);

/* Initialize the vos_space_metrics timeout counter */
vsm->vsm_last_update_ts = 0;

Expand Down
126 changes: 121 additions & 5 deletions src/vos/vos_gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,33 @@ gc_get_container(struct vos_pool *pool)
return cont;
}

/**
 * Flush the per-slice GC statistics of \a pool.
 *
 * The per-slice counters are added to the pool's cumulative GC statistics
 * and, when the pool has telemetry attached, to the matching GC telemetry
 * counters. The per-slice counters are zeroed afterwards.
 */
static void
gc_update_stats(struct vos_pool *pool)
{
	struct vos_gc_stat	*slice = &pool->vp_gc_stat;
	struct vos_gc_stat	*total = &pool->vp_gc_stat_global;

	/* Accumulate this slice into the cumulative statistics */
	total->gs_conts	 += slice->gs_conts;
	total->gs_objs	 += slice->gs_objs;
	total->gs_dkeys	 += slice->gs_dkeys;
	total->gs_akeys	 += slice->gs_akeys;
	total->gs_recxs	 += slice->gs_recxs;
	total->gs_singvs += slice->gs_singvs;

	/* Telemetry is only reported when the pool carries metrics */
	if (pool->vp_metrics != NULL) {
		struct vos_gc_metrics *metrics = &pool->vp_metrics->vp_gc_metrics;

		d_tm_inc_counter(metrics->vgm_cont_del, slice->gs_conts);
		d_tm_inc_counter(metrics->vgm_obj_del, slice->gs_objs);
		d_tm_inc_counter(metrics->vgm_dkey_del, slice->gs_dkeys);
		d_tm_inc_counter(metrics->vgm_akey_del, slice->gs_akeys);
		d_tm_inc_counter(metrics->vgm_ev_del, slice->gs_recxs);
		d_tm_inc_counter(metrics->vgm_sv_del, slice->gs_singvs);
	}

	/* Start the next slice from zero */
	memset(slice, 0, sizeof(*slice));
}

/**
* Run garbage collector for a pool, it returns if all @credits are consumed
* or there is nothing to be reclaimed.
Expand All @@ -671,7 +698,7 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret)

if (pool->vp_dying) {
*empty_ret = true;
return 0;
D_GOTO(done, rc = 0);
}

/* take an extra ref to avoid concurrent container destroy/free */
Expand All @@ -684,7 +711,8 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret)
DP_UUID(pool->vp_id), DP_RC(rc));
if (cont != NULL)
vos_cont_decref(cont);
return rc;
*empty_ret = false;
goto done;
}

*empty_ret = false;
Expand Down Expand Up @@ -781,6 +809,9 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret)
if (cont != NULL)
vos_cont_decref(cont);

done:
gc_update_stats(pool);

return rc;
}

Expand Down Expand Up @@ -916,7 +947,7 @@ gc_have_pool(struct vos_pool *pool)
static void
gc_log_pool(struct vos_pool *pool)
{
struct vos_gc_stat *stat = &pool->vp_gc_stat;
struct vos_gc_stat *stat = &pool->vp_gc_stat_global;

D_DEBUG(DB_TRACE,
"Pool="DF_UUID", GC reclaimed:\n"
Expand Down Expand Up @@ -1106,6 +1137,9 @@ int
vos_gc_pool(daos_handle_t poh, int credits, int (*yield_func)(void *arg),
void *yield_arg)
{
struct d_tm_node_t *duration = NULL;
struct d_tm_node_t *tight = NULL;
struct d_tm_node_t *slack = NULL;
struct vos_pool *pool = vos_hdl2pool(poh);
struct vos_tls *tls = vos_tls_get(pool->vp_sysdb);
struct vos_gc_param param;
Expand All @@ -1131,24 +1165,44 @@ vos_gc_pool(daos_handle_t poh, int credits, int (*yield_func)(void *arg),

tls->vtl_gc_running++;

if (pool->vp_metrics != NULL) {
duration = pool->vp_metrics->vp_gc_metrics.vgm_duration;
slack = pool->vp_metrics->vp_gc_metrics.vgm_slack_cnt;
tight = pool->vp_metrics->vp_gc_metrics.vgm_tight_cnt;
}

while (1) {
int creds = param.vgc_credits;

d_tm_mark_duration_start(duration, D_TM_CLOCK_THREAD_CPUTIME);
if (creds == GC_CREDS_TIGHT)
d_tm_inc_counter(tight, 1);
else
d_tm_inc_counter(slack, 1);

if (credits > 0 && (credits - total) < creds)
creds = credits - total;

total += creds;
rc = vos_gc_pool_tight(poh, &creds);

if (rc) {
D_ERROR("GC pool failed: " DF_RC "\n", DP_RC(rc));
d_tm_mark_duration_end(duration);
break;
}
total -= creds; /* subtract the remaining credits */
if (creds != 0)
if (creds != 0) {
d_tm_mark_duration_end(duration);
break; /* reclaimed everything */
}

if (credits > 0 && total >= credits)
if (credits > 0 && total >= credits) {
d_tm_mark_duration_end(duration);
break; /* consumed all credits */
}

d_tm_mark_duration_end(duration);

if (vos_gc_yield(&param)) {
D_DEBUG(DB_TRACE, "GC pool run aborted\n");
Expand Down Expand Up @@ -1199,3 +1253,65 @@ vos_flush_pool(daos_handle_t poh, bool force, uint32_t nr_flush, uint32_t *nr_fl

return rc;
}

#define VOS_GC_DIR "vos_gc"
/**
 * Register the garbage collection telemetry nodes of one target.
 *
 * Each node is created under "<path>/<VOS_GC_DIR>/<name>/tgt_<tgt_id>" and its
 * handle is stored in \a vgm. A registration failure is only logged as a
 * warning; the remaining nodes are still registered.
 *
 * \param[out] vgm	GC metrics structure receiving the node handles
 * \param[in]  path	Telemetry path prefix
 * \param[in]  tgt_id	Target ID used in the node path
 */
void
vos_gc_metrics_init(struct vos_gc_metrics *vgm, const char *path, int tgt_id)
{
	int ret;

	/* Duration of one GC slice, measured on the thread CPU-time clock */
	ret = d_tm_add_metric(&vgm->vgm_duration,
			      D_TM_DURATION | D_TM_CLOCK_THREAD_CPUTIME,
			      "GC slice duration", NULL,
			      "%s/%s/duration/tgt_%u", path, VOS_GC_DIR, tgt_id);
	if (ret != 0) {
		D_WARN("Failed to create 'duration' telemetry: " DF_RC "\n", DP_RC(ret));
	}

	/* Number of containers reclaimed */
	ret = d_tm_add_metric(&vgm->vgm_cont_del, D_TM_COUNTER,
			      "GC containers deleted", NULL,
			      "%s/%s/cont_del/tgt_%u", path, VOS_GC_DIR, tgt_id);
	if (ret != 0) {
		D_WARN("Failed to create 'cont_del' telemetry: " DF_RC "\n", DP_RC(ret));
	}

	/* Number of objects reclaimed */
	ret = d_tm_add_metric(&vgm->vgm_obj_del, D_TM_COUNTER,
			      "GC objects deleted", NULL,
			      "%s/%s/obj_del/tgt_%u", path, VOS_GC_DIR, tgt_id);
	if (ret != 0) {
		D_WARN("Failed to create 'obj_del' telemetry: " DF_RC "\n", DP_RC(ret));
	}

	/* Number of dkeys reclaimed */
	ret = d_tm_add_metric(&vgm->vgm_dkey_del, D_TM_COUNTER,
			      "GC dkeys deleted", NULL,
			      "%s/%s/dkey_del/tgt_%u", path, VOS_GC_DIR, tgt_id);
	if (ret != 0) {
		D_WARN("Failed to create 'dkey_del' telemetry: " DF_RC "\n", DP_RC(ret));
	}

	/* Number of akeys reclaimed */
	ret = d_tm_add_metric(&vgm->vgm_akey_del, D_TM_COUNTER,
			      "GC akeys deleted", NULL,
			      "%s/%s/akey_del/tgt_%u", path, VOS_GC_DIR, tgt_id);
	if (ret != 0) {
		D_WARN("Failed to create 'akey_del' telemetry: " DF_RC "\n", DP_RC(ret));
	}

	/* Number of EV records reclaimed */
	ret = d_tm_add_metric(&vgm->vgm_ev_del, D_TM_COUNTER,
			      "GC ev deleted", NULL,
			      "%s/%s/ev_del/tgt_%u", path, VOS_GC_DIR, tgt_id);
	if (ret != 0) {
		D_WARN("Failed to create 'ev_del' telemetry: " DF_RC "\n", DP_RC(ret));
	}

	/* Number of SV records reclaimed */
	ret = d_tm_add_metric(&vgm->vgm_sv_del, D_TM_COUNTER,
			      "GC sv deleted", NULL,
			      "%s/%s/sv_del/tgt_%u", path, VOS_GC_DIR, tgt_id);
	if (ret != 0) {
		D_WARN("Failed to create 'sv_del' telemetry: " DF_RC "\n", DP_RC(ret));
	}

	/* Number of GC runs in slack mode */
	ret = d_tm_add_metric(&vgm->vgm_slack_cnt, D_TM_COUNTER,
			      "GC slack mode count", NULL,
			      "%s/%s/slack_cnt/tgt_%u", path, VOS_GC_DIR, tgt_id);
	if (ret != 0) {
		D_WARN("Failed to create 'slack_cnt' telemetry: " DF_RC "\n", DP_RC(ret));
	}

	/* Number of GC runs in tight mode */
	ret = d_tm_add_metric(&vgm->vgm_tight_cnt, D_TM_COUNTER,
			      "GC tight mode count", NULL,
			      "%s/%s/tight_cnt/tgt_%u", path, VOS_GC_DIR, tgt_id);
	if (ret != 0) {
		D_WARN("Failed to create 'tight_cnt' telemetry: " DF_RC "\n", DP_RC(ret));
	}
}
20 changes: 20 additions & 0 deletions src/vos/vos_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,19 @@ struct vos_agg_metrics {
struct d_tm_node_t *vam_del_ev; /* Deleted EV records */
struct d_tm_node_t *vam_merge_recs; /* Total merged EV records */
struct d_tm_node_t *vam_merge_size; /* Total merged size */
struct d_tm_node_t *vam_fail_count; /* Aggregation failed */
};

/* Telemetry nodes for VOS garbage collection; registered by vos_gc_metrics_init() */
struct vos_gc_metrics {
struct d_tm_node_t *vgm_duration; /* Duration of each GC slice (thread CPU-time clock) */
struct d_tm_node_t *vgm_cont_del; /* Counter: containers reclaimed */
struct d_tm_node_t *vgm_obj_del; /* Counter: objects reclaimed */
struct d_tm_node_t *vgm_dkey_del; /* Counter: dkeys reclaimed */
struct d_tm_node_t *vgm_akey_del; /* Counter: akeys reclaimed */
struct d_tm_node_t *vgm_ev_del; /* Counter: EV records reclaimed */
struct d_tm_node_t *vgm_sv_del; /* Counter: SV records reclaimed */
struct d_tm_node_t *vgm_slack_cnt; /* Counter: GC slices run in slack mode */
struct d_tm_node_t *vgm_tight_cnt; /* Counter: GC slices run in tight mode */
};

/*
Expand All @@ -200,10 +213,14 @@ struct vos_chkpt_metrics {
};

void vos_chkpt_metrics_init(struct vos_chkpt_metrics *vc_metrics, const char *path, int tgt_id);
/*
 * Register the garbage collection telemetry nodes of target @tgt_id under
 * @path and store the node handles in @vgm.
 */
void
vos_gc_metrics_init(struct vos_gc_metrics *vgm, const char *path, int tgt_id);

/* Telemetry nodes for VOS space reporting, plus the timestamp used to throttle updates */
struct vos_space_metrics {
struct d_tm_node_t *vsm_scm_used; /* SCM space used */
struct d_tm_node_t *vsm_nvme_used; /* NVMe space used */
struct d_tm_node_t *vsm_scm_total; /* Gauge: SCM space total, in bytes */
struct d_tm_node_t *vsm_nvme_total; /* Gauge: NVMe space total, in bytes */
uint64_t vsm_last_update_ts; /* Compared with the current coarse time to throttle updates; starts at 0 */
};

Expand All @@ -219,6 +236,7 @@ struct vos_rh_metrics {
struct vos_pool_metrics {
void *vp_vea_metrics;
struct vos_agg_metrics vp_agg_metrics;
struct vos_gc_metrics vp_gc_metrics;
struct vos_space_metrics vp_space_metrics;
struct vos_chkpt_metrics vp_chkpt_metrics;
struct vos_rh_metrics vp_rh_metrics;
Expand Down Expand Up @@ -255,6 +273,8 @@ struct vos_pool {
/** btr handle for the container table */
daos_handle_t vp_cont_th;
/** GC statistics of this pool */
struct vos_gc_stat vp_gc_stat_global;
/** GC per slice statistics of this pool */
struct vos_gc_stat vp_gc_stat;
/** link chain on vos_tls::vtl_gc_pools */
d_list_t vp_gc_link;
Expand Down
4 changes: 2 additions & 2 deletions src/vos/vos_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -1494,7 +1494,7 @@ vos_pool_query(daos_handle_t poh, vos_pool_info_t *pinfo)

D_ASSERT(pinfo != NULL);
pinfo->pif_cont_nr = pool_df->pd_cont_nr;
pinfo->pif_gc_stat = pool->vp_gc_stat;
pinfo->pif_gc_stat = pool->vp_gc_stat_global;

rc = vos_space_query(pool, &pinfo->pif_space, true);
if (rc)
Expand Down Expand Up @@ -1552,7 +1552,7 @@ vos_pool_ctl(daos_handle_t poh, enum vos_pool_opc opc, void *param)
default:
return -DER_NOSYS;
case VOS_PO_CTL_RESET_GC:
memset(&pool->vp_gc_stat, 0, sizeof(pool->vp_gc_stat));
memset(&pool->vp_gc_stat_global, 0, sizeof(pool->vp_gc_stat_global));
break;
case VOS_PO_CTL_SET_POLICY:
if (param == NULL)
Expand Down
6 changes: 6 additions & 0 deletions src/vos/vos_space.c
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,12 @@ vos_space_update_metrics(struct vos_pool *pool)
if (!vpm)
return;

if (vpm->vp_space_metrics.vsm_last_update_ts == 0) {
/* Set the constant values */
d_tm_set_gauge(vpm->vp_space_metrics.vsm_scm_total, pool->vp_pool_df->pd_scm_sz);
d_tm_set_gauge(vpm->vp_space_metrics.vsm_nvme_total, pool->vp_pool_df->pd_nvme_sz);
}

now = daos_gettime_coarse();
if (now < vpm->vp_space_metrics.vsm_last_update_ts + VOS_SPACE_METRICS_INTV) {
return;
Expand Down
Loading