Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: convert tag extractor regexs to Re2 #14519

Merged
merged 26 commits into from
Jan 14, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
1215e11
stats: convert tag extractor regexs to Re2
rojkov Dec 18, 2020
f242af4
extend tag values to include dash and simplify regex for stat prefixes
rojkov Dec 28, 2020
b6865ad
drop optional matches since stat name is not muated until all extract…
rojkov Dec 28, 2020
230baaf
fix clang_tidy check
rojkov Dec 28, 2020
baee204
add reference results for the benchmark
rojkov Dec 28, 2020
12d7cc4
fix check_spelling_pedantic.py
rojkov Dec 28, 2020
c5561f6
fix clang_tidy check once again
rojkov Dec 28, 2020
bdf4545
update regex for tag values
rojkov Dec 29, 2020
b224e44
make regexes expandable with configurable patterns for sections
rojkov Dec 29, 2020
f2fb755
Merge remote-tracking branch 'upstream/master' into re2
rojkov Dec 29, 2020
78aefc7
simplify regex expansion
rojkov Dec 30, 2020
75b8697
Merge remote-tracking branch 'upstream/master' into re2
rojkov Dec 30, 2020
4c5e2d6
revert tag_extractor test to old tag values
rojkov Dec 31, 2020
6f66047
removed unused TagNameValues::addRegex()
rojkov Dec 31, 2020
b04be99
restore square brackets in comment
rojkov Dec 31, 2020
95aa344
Merge remote-tracking branch 'upstream/master' into re2
rojkov Dec 31, 2020
a833dcc
add comments to regexes
rojkov Jan 5, 2021
cc4d575
Merge remote-tracking branch 'upstream/master' into re2
rojkov Jan 5, 2021
9288659
add a comment explaining HTTP_CONN_MANAGER_PREFIX regex
rojkov Jan 5, 2021
1ecbac0
log tag extractor usage
rojkov Jan 8, 2021
f7c9045
put perf debug output under ifdef
rojkov Jan 8, 2021
d4d9553
make output more greppable with TagStats
rojkov Jan 11, 2021
7e4d7f2
Make expandRegex()'s comment more descriptive
rojkov Jan 11, 2021
618a203
amend comments and unify usage of symbol classes in regexes
rojkov Jan 12, 2021
0ad2460
simplify macros
rojkov Jan 12, 2021
910cb04
put destructor under explicit ifdef
rojkov Jan 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 40 additions & 37 deletions source/common/config/well_known_names.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,97 +24,100 @@ TagNameValues::TagNameValues() {
// - Typical * notation will be used to denote an arbitrary set of characters.

// *_rq(_<response_code>)
addRegex(RESPONSE_CODE, "_rq(_(\\d{3}))$", "_rq_");
addRe2(RESPONSE_CODE, R"(_rq(_(\d{3}))$)", "_rq_");

// *_rq_(<response_code_class>)xx
addRegex(RESPONSE_CODE_CLASS, "_rq_(\\d)xx$", "_rq_");
addRe2(RESPONSE_CODE_CLASS, R"(_rq_((\d))xx$)", "_rq_");

// http.[<stat_prefix>.]dynamodb.table.[<table_name>.]capacity.[<operation_name>.](__partition_id=<last_seven_characters_from_partition_id>)
addRegex(DYNAMO_PARTITION_ID,
"^http(?=\\.).*?\\.dynamodb\\.table(?=\\.).*?\\."
"capacity(?=\\.).*?(\\.__partition_id=(\\w{7}))$",
".dynamodb.table.");
addRe2(
DYNAMO_PARTITION_ID,
R"(^http\.(?:[\w-]+\.)*?dynamodb\.table\.(?:[\w-]+\.)?capacity\.[\w-]+(\.__partition_id=(\w{7}))$)",
".dynamodb.table.");

// http.[<stat_prefix>.]dynamodb.operation.(<operation_name>.)<base_stat> or
// http.[<stat_prefix>.]dynamodb.table.[<table_name>.]capacity.(<operation_name>.)[<partition_id>]
addRegex(DYNAMO_OPERATION,
"^http(?=\\.).*?\\.dynamodb.(?:operation|table(?="
"\\.).*?\\.capacity)(\\.(.*?))(?:\\.|$)",
".dynamodb.");
addRe2(
DYNAMO_OPERATION,
R"(^http\.(?:[\w-]+\.)*?dynamodb.(?:operation|table\.(?:[\w-]+\.)?capacity)(\.([\w-]+))(?:\.|$))",
".dynamodb.");

// mongo.[<stat_prefix>.]collection.[<collection>.]callsite.(<callsite>.)query.<base_stat>
addRegex(MONGO_CALLSITE,
R"(^mongo(?=\.).*?\.collection(?=\.).*?\.callsite\.((.*?)\.).*?query.\w+?$)",
".collection.");
addRe2(MONGO_CALLSITE,
R"(^mongo\.(?:[\w-]+\.)*?collection\.(?:[\w-]+\.)?callsite\.(([\w-]+)\.).*?query\.)",
".collection.");

// http.[<stat_prefix>.]dynamodb.table.(<table_name>.) or
// http.[<stat_prefix>.]dynamodb.error.(<table_name>.)*
addRegex(DYNAMO_TABLE, R"(^http(?=\.).*?\.dynamodb.(?:table|error)\.((.*?)\.))", ".dynamodb.");
addRe2(DYNAMO_TABLE, R"(^http\.(?:[\w-]+\.)*?dynamodb.(?:table|error)\.(([\w-]+)\.))",
".dynamodb.");

// mongo.[<stat_prefix>.]collection.(<collection>.)query.<base_stat>
addRegex(MONGO_COLLECTION, R"(^mongo(?=\.).*?\.collection\.((.*?)\.).*?query.\w+?$)",
".collection.");
addRe2(MONGO_COLLECTION, R"(^mongo\.(?:[\w-]+\.)*?collection\.(([\w-]+)\.).*?query\.)",
".collection.");

// mongo.[<stat_prefix>.]cmd.(<cmd>.)<base_stat>
addRegex(MONGO_CMD, R"(^mongo(?=\.).*?\.cmd\.((.*?)\.)\w+?$)", ".cmd.");
htuch marked this conversation as resolved.
Show resolved Hide resolved
addRe2(MONGO_CMD, R"(^mongo\.(?:[\w-]+\.)*?cmd\.(([\w-]+)\.))", ".cmd.");

// cluster.[<route_target_cluster>.]grpc.[<grpc_service>.](<grpc_method>.)<base_stat>
addRegex(GRPC_BRIDGE_METHOD, R"(^cluster(?=\.).*?\.grpc(?=\.).*\.((.*?)\.)\w+?$)", ".grpc.");
addRe2(GRPC_BRIDGE_METHOD, R"(^cluster\.(?:[\w-]+\.)?grpc\.(?:[\w-]+\.)?(([\w-]+)\.))", ".grpc.");

// http.[<stat_prefix>.]user_agent.(<user_agent>.)<base_stat>
addRegex(HTTP_USER_AGENT, R"(^http(?=\.).*?\.user_agent\.((.*?)\.)\w+?$)", ".user_agent.");
addRe2(HTTP_USER_AGENT, R"(^http\.(?:[\w-]+\.)*?user_agent\.(([\w-]+)\.))", ".user_agent.");

// vhost.[<virtual host name>.]vcluster.(<virtual_cluster_name>.)<base_stat>
addRegex(VIRTUAL_CLUSTER, R"(^vhost(?=\.).*?\.vcluster\.((.*?)\.)\w+?$)", ".vcluster.");
htuch marked this conversation as resolved.
Show resolved Hide resolved
addRe2(VIRTUAL_CLUSTER, R"(^vhost\.(?:[\w-]+\.)*?vcluster\.(([\w-]+)\.))", ".vcluster.");

// http.[<stat_prefix>.]fault.(<downstream_cluster>.)<base_stat>
addRegex(FAULT_DOWNSTREAM_CLUSTER, R"(^http(?=\.).*?\.fault\.((.*?)\.)\w+?$)", ".fault.");
addRe2(FAULT_DOWNSTREAM_CLUSTER, R"(^http\.(?:[\w-]+\.)*?fault\.(([\w-]+)\.))", ".fault.");

// listener.[<address>.]ssl.cipher.(<cipher>)
addRegex(SSL_CIPHER, R"(^listener(?=\.).*?\.ssl\.cipher(\.(.*?))$)");
addRe2(SSL_CIPHER, R"(^listener\.(?:.*?\.)?ssl\.cipher(\.([0-9A-Za-z_-]+))$)");

// cluster.[<cluster_name>.]ssl.ciphers.(<cipher>)
addRegex(SSL_CIPHER_SUITE, R"(^cluster(?=\.).*?\.ssl\.ciphers(\.(.*?))$)", ".ssl.ciphers.");
addRe2(SSL_CIPHER_SUITE, R"(^cluster\.(?:[\w-]+\.)?ssl\.ciphers(\.([0-9A-Za-z_-]+))$)",
".ssl.ciphers.");

// cluster.[<route_target_cluster>.]grpc.(<grpc_service>.)*
addRegex(GRPC_BRIDGE_SERVICE, R"(^cluster(?=\.).*?\.grpc\.((.*?)\.))", ".grpc.");
addRe2(GRPC_BRIDGE_SERVICE, R"(^cluster\.(?:[\w-]+\.)?grpc\.(([\w-]+)\.))", ".grpc.");

// tcp.(<stat_prefix>.)<base_stat>
addRegex(TCP_PREFIX, R"(^tcp\.((.*?)\.)\w+?$)");
addRe2(TCP_PREFIX, R"(^tcp\.(([\w-]+)\.)+?\w+?$)");

// udp.(<stat_prefix>.)<base_stat>
addRegex(UDP_PREFIX, R"(^udp\.((.*?)\.)\w+?$)");
addRe2(UDP_PREFIX, R"(^udp\.(([\w-]+)\.)+?\w+?$)");

// auth.clientssl.(<stat_prefix>.)<base_stat>
addRegex(CLIENTSSL_PREFIX, R"(^auth\.clientssl\.((.*?)\.)\w+?$)");
addRe2(CLIENTSSL_PREFIX, R"(^auth\.clientssl\.(([\w-]+)\.)+?\w+?$)");

// ratelimit.(<stat_prefix>.)<base_stat>
addRegex(RATELIMIT_PREFIX, R"(^ratelimit\.((.*?)\.)\w+?$)");
addRe2(RATELIMIT_PREFIX, R"(^ratelimit\.(([\w-]+)\.)+?\w+?$)");

// cluster.(<cluster_name>.)*
addRe2(CLUSTER_NAME, "^cluster\\.(([^\\.]+)\\.).*");
addRe2(CLUSTER_NAME, R"(^cluster\.(([\w-]+)\.))");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a question on the use of \w here -- does this possibly reject expressions that the original one did not? I think the answer is "yes". E.g. punctuation other than "." or "-" that might be expected to work. I think cluster-names are in control of the user, so this might be a visible change in behavior.

Is there a speed difference for using \w- vs ^\. ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^\. seems to perform better. Perhaps this is a negated comparison with a single symbol vs comparisons with multiple symbols. Replaced with ^\. everywhere.


// listener.[<address>.]http.(<stat_prefix>.)*
addRegex(HTTP_CONN_MANAGER_PREFIX, R"(^listener(?=\.).*?\.http\.((.*?)\.))", ".http.");
addRe2(HTTP_CONN_MANAGER_PREFIX, R"(^listener\.(?:.*?\.)?http\.(([\w-]+)\.))", ".http.");

// http.(<stat_prefix>.)*
addRegex(HTTP_CONN_MANAGER_PREFIX, "^http\\.((.*?)\\.)");
addRe2(HTTP_CONN_MANAGER_PREFIX, R"(^http\.(([\w-]+)\.))");
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, is it OK to have two patterns with the same name like in the lines 100 and 103?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. I think the semantics of this pre-date my involvement with this code, but it relates to runtime configuration of stat tag extraction regexes.

Regardless, this situation is pre-existing and it doesn't look like you've caused it to regress.


// listener.(<address>.)*
addRegex(LISTENER_ADDRESS,
R"(^listener\.(((?:[_.[:digit:]]*|[_\[\]aAbBcCdDeEfF[:digit:]]*))\.))");
addRe2(
LISTENER_ADDRESS,
R"(^listener\.(((?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}_\d+|\[[_aAbBcCdDeEfF[:digit:]]+\]_\d+))\.))");

// vhost.(<virtual host name>.)*
addRegex(VIRTUAL_HOST, "^vhost\\.((.*?)\\.)");
addRe2(VIRTUAL_HOST, R"(^vhost\.(([\w-]+)\.))");

// mongo.(<stat_prefix>.)*
addRegex(MONGO_PREFIX, "^mongo\\.((.*?)\\.)");
addRe2(MONGO_PREFIX, R"(^mongo\.(([\w-]+)\.))");

// http.[<stat_prefix>.]rds.(<route_config_name>.)<base_stat>
addRegex(RDS_ROUTE_CONFIG, R"(^http(?=\.).*?\.rds\.((.*?)\.)\w+?$)", ".rds.");
addRe2(RDS_ROUTE_CONFIG, R"(^http\.(?:[\w-]+\.)*?rds\.(([\w-\.]+)\.)\w+?$)", ".rds.");

// listener_manager.(worker_<id>.)*
addRegex(WORKER_ID, R"(^listener_manager\.((worker_\d+)\.))", "listener_manager.worker_");
addRe2(WORKER_ID, R"(^listener_manager\.((worker_\d+)\.))", "listener_manager.worker_");
}

void TagNameValues::addRegex(const std::string& name, const std::string& regex,
Expand Down
4 changes: 2 additions & 2 deletions source/common/stats/tag_extractor_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ bool TagExtractorRe2Impl::extractTag(absl::string_view stat_name, std::vector<Ta
re2::StringPiece remove_subexpr, value_subexpr;

// The regex must match and contain one or more subexpressions (all after the first are ignored).
if (re2::RE2::FullMatch(re2::StringPiece(stat_name.data(), stat_name.size()), regex_,
&remove_subexpr, &value_subexpr) &&
if (re2::RE2::PartialMatch(re2::StringPiece(stat_name.data(), stat_name.size()), regex_,
&remove_subexpr, &value_subexpr) &&
!remove_subexpr.empty()) {

// value_subexpr is the optional second submatch. It is usually inside the first submatch
Expand Down
19 changes: 19 additions & 0 deletions test/common/stats/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,25 @@ envoy_cc_test(
],
)

envoy_cc_benchmark_binary(
name = "tag_extractor_impl_benchmark",
srcs = [
"tag_extractor_impl_speed_test.cc",
],
external_deps = [
"benchmark",
],
deps = [
"//source/common/stats:tag_producer_lib",
"@envoy_api//envoy/config/metrics/v3:pkg_cc_proto",
],
)

envoy_benchmark_test(
name = "tag_extractor_impl_benchmark_test",
benchmark_binary = "tag_extractor_impl_benchmark",
)

envoy_cc_test(
name = "thread_local_store_test",
srcs = ["thread_local_store_test.cc"],
Expand Down
69 changes: 69 additions & 0 deletions test/common/stats/tag_extractor_impl_speed_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include "envoy/config/metrics/v3/stats.pb.h"

#include "common/common/assert.h"
#include "common/config/well_known_names.h"
#include "common/stats/tag_producer_impl.h"

#include "benchmark/benchmark.h"

namespace Envoy {
namespace Stats {
namespace {

using Params = std::tuple<std::string, uint32_t>;

const std::vector<Params> params = {
{"listener.127.0.0.1_3012.http.http_prefix.downstream_rq_5xx", 3},
{"cluster.ratelimit.upstream_rq_timeout", 1},
{"listener.[__1]_0.ssl.cipher.AES256-SHA", 2},
{"cluster.ratelimit.ssl.ciphers.ECDHE-RSA-AES128-GCM-SHA256", 2},
{"listener.[2001_0db8_85a3_0000_0000_8a2e_0370_7334]_3543.ssl.cipher.AES256-SHA", 2},
{"listener.127.0.0.1_0.ssl.cipher.AES256-SHA", 2},
{"mongo.mongo_filter.op_reply", 1},
{"mongo.mongo_filter.cmd.foo_cmd.reply_size", 2},
{"mongo.mongo_filter.collection.bar_collection.query.multi_get", 2},
{"mongo.mongo_filter.collection.bar_collection.callsite.baz_callsite.query.scatter_get", 3},
{"mongo.mongo_filter.with.dot.in.prefix.collection.bar_collection.callsite.baz_callsite.query."
"scatter_get",
3},
{"ratelimit.foo_ratelimiter.over_limit", 1},
{"http.egress_dynamodb_iad.downstream_cx_total", 1},
{"http.egress_dynamodb_iad.dynamodb.operation.Query.upstream_rq_time", 2},
{"http.egress_dynamodb_iad.dynamodb.table.bar_table.upstream_rq_time", 2},
{"http.egress_dynamodb_iad.dynamodb.table.bar_table.capacity.Query.__partition_id=ABC1234", 4},
{"cluster.grpc_cluster.grpc.grpc_service_1.grpc_method_1.success", 3},
{"vhost.vhost_1.vcluster.vcluster_1.upstream_rq_2xx", 3},
{"vhost.vhost_1.vcluster.vcluster_1.upstream_rq_200", 3},
{"http.egress_dynamodb_iad.user_agent.ios.downstream_cx_total", 2},
{"auth.clientssl.clientssl_prefix.auth_ip_allowlist", 1},
{"tcp.tcp_prefix.downstream_flow_control_resumed_reading_total", 1},
{"udp.udp_prefix-with-dashes.downstream_flow_control_resumed_reading_total", 1},
{"http.fault_connection_manager.fault.fault_cluster.aborts_injected", 2},
{"http.rds_connection_manager.rds.route_config.123.update_success", 2},
{"listener_manager.worker_123.dispatcher.loop_duration_us", 1},
{"mongo_mongo_mongo_mongo.this_is_rather_long_string_which "
"does_not_match_and_consumes_a_lot_in_case_of_backtracking_imposed_by_greedy_pattern",
0},
{"another_long_but_matching_string_which_may_consume_resources_if_missing_end_of_line_lock_rq_"
"2xx",
1},
};

static void bmExtractTags(benchmark::State& state) {
TagProducerImpl tag_extractors{envoy::config::metrics::v3::StatsConfig()};
const auto idx = state.range(0);
const auto& p = params[idx];
absl::string_view str = std::get<0>(p);
const uint32_t tags_size = std::get<1>(p);

for (auto _ : state) {
jmarantz marked this conversation as resolved.
Show resolved Hide resolved
TagVector tags;
tag_extractors.produceTags(str, tags);
RELEASE_ASSERT(tags.size() == tags_size, "");
}
}
BENCHMARK(bmExtractTags)->DenseRange(0, 27, 1);

} // namespace
} // namespace Stats
} // namespace Envoy