Skip to content

Commit

Permalink
Miscellaneous test improvements, and better diagnostics before a fata…
Browse files Browse the repository at this point in the history
…l error in MvccManager::AddPending

Summary:
- Better diagnostics before a fatal error in MvccManager::AddPending>
- Print fully symbolized test failure stack trace to stderr before crashing for better test debugging.
- Do some regex replacements on symbolized stack trace lines for better readability.
- Post-process symbolized function names to make them more readable.
- Update BaseCQLTest to wait until `system.peers` contains enough entries before proceeding.
- Better failure reporting in TestRF1Cluster.
- Suppress UBSAN warnings in the AWS C++ client.
- Provide a way to customize sanitizer options for tests.
- Add retries to gen_version_info.py (useful when getting "Resource temporarily unavailable" on NFS).
- Do not fail if `~/.m2/settings.xml` is not present (unless we're running on Jenkins).
- `update_test_result_xml.py` should not try to parse XML files more than 16 MB in size.
- The `waitFor` function in Java tests now automatically applies the TSAN timeout multiplier.
- Print sanitizer-related environment variables at the top of the test log.
- Pass minicluster daemon id to yb-master and yb-tserver in tests.
- Minor improvements in MonoTime::ToString().
- Ability to run Java tests in `yb_build.sh`.

Test Plan: Jenkins

Reviewers: pritam.damania, hector, bharat, timur, sergei

Reviewed By: sergei

Subscribers: bharat, ybase

Differential Revision: https://phabricator.dev.yugabyte.com/D4307
  • Loading branch information
mbautin committed Mar 13, 2018
1 parent 41a2c40 commit 7147469
Show file tree
Hide file tree
Showing 29 changed files with 582 additions and 240 deletions.
13 changes: 10 additions & 3 deletions bin/repeat_unit_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Options:
be allowed.
--skip-log-compression
Don't compress kept logs.
--stop-at-failure
--stop-at-failure, --stop-on-failure
Stop running further iterations after the first failure happens.
EOT
}
Expand Down Expand Up @@ -146,10 +146,13 @@ while [[ $# -gt 0 ]]; do
fi
yb_compiler_type_arg="clang"
;;
--stop-at-failure)
--stop-at-failure|--stop-on-failure)
stop_at_failure=true
;;
*)
if [[ $1 == "--" ]]; then
fatal "'--' is not a valid positional argument, something is wrong."
fi
positional_args+=( "$1" )
;;
esac
Expand All @@ -163,7 +166,7 @@ fi

set_cmake_build_type_and_compiler_type
set_build_root
set_asan_tsan_runtime_options
set_sanitizer_runtime_options

declare -i -r num_pos_args=${#positional_args[@]}
if [[ $num_pos_args -lt 1 || $num_pos_args -gt 2 ]]; then
Expand Down Expand Up @@ -277,6 +280,10 @@ $RANDOM.$RANDOM.$RANDOM.$$
fi

echo "$pass_or_fail: iteration $iteration, $elapsed_time_sec sec$comment"

if [[ $pass_or_fail == "FAILED" ]]; then
touch "$failure_flag_file_path"
fi
else
if [[ -n $yb_compiler_type_from_env ]]; then
log "YB_COMPILER_TYPE env variable was set to '$yb_compiler_type_from_env' by the caller."
Expand Down
58 changes: 43 additions & 15 deletions build-support/common-test-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ readonly RELEVANT_LOG_LINES_RE
# Some functions use this to output to stdout/stderr along with a file.
append_output_to=/dev/null

readonly ADDRESS_ALREADY_IN_USE_PATTERN="Address already in use"
readonly TEST_RESTART_PATTERN="Address already in use|\
pthread .*: Device or resource busy"

# We use this to submit test jobs for execution on Spark.
readonly SPARK_SUBMIT_CMD_PATH_NON_TSAN=/n/tools/spark/current/bin/spark-submit
Expand All @@ -76,6 +77,9 @@ readonly LIST_OF_TESTS_DIR_NAME="list_of_tests"

readonly JENKINS_NFS_BUILD_REPORT_BASE_DIR="/n/jenkins/build_stats"

# https://github.com/google/sanitizers/wiki/SanitizerCommonFlags
readonly SANITIZER_COMMON_OPTIONS=""

# -------------------------------------------------------------------------------------------------
# Functions
# -------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -239,6 +243,12 @@ collect_gtest_tests() {
gtest_list_tests_cmd+=( "--gtest_filter=$YB_GTEST_FILTER" )
fi

# We need to list the tests without any user-specified sanitizer options, because that might
# produce unintended output.
local old_sanitizer_extra_options=${YB_SANITIZER_EXTRA_OPTIONS:-}
local YB_SANITIZER_EXTRA_OPTIONS=""
set_sanitizer_runtime_options

set +e
pushd "$gtest_list_tests_tmp_dir" >/dev/null
local gtest_list_tests_result # this has to be on a separate line to capture the exit code
Expand All @@ -249,6 +259,10 @@ collect_gtest_tests() {
popd >/dev/null
set -e

YB_SANITIZER_EXTRA_OPTIONS=$old_sanitizer_extra_options
set_sanitizer_runtime_options


set_expected_core_dir "$gtest_list_tests_tmp_dir"
test_log_path="$gtest_list_stderr_path"
process_core_file
Expand Down Expand Up @@ -853,9 +867,9 @@ run_one_test() {
# See if the test failed due to "Address already in use" and log a message if we still have more
# attempts left.
if [[ $attempts_left -gt 0 ]] && \
egrep -q "$ADDRESS_ALREADY_IN_USE_PATTERN" "$test_log_path"; then
log "Found 'Address already in use' in test log, re-running the test:"
egrep "$ADDRESS_ALREADY_IN_USE_PATTERN" "$test_log_path" >&2
egrep -q "$TEST_RESTART_PATTERN" "$test_log_path"; then
log "Found one of the intermittent error patterns in the log, restarting the test (once):"
egrep "$TEST_RESTART_PATTERN" "$test_log_path" >&2
elif [[ $test_exit_code -ne 0 ]]; then
# Avoid retrying any other kinds of failures.
break
Expand Down Expand Up @@ -934,9 +948,16 @@ run_test_and_process_results() {
delete_successful_output_if_needed
}

set_asan_tsan_runtime_options() {
set_sanitizer_runtime_options() {
expect_vars_to_be_set BUILD_ROOT

# Don't allow setting these options directly from outside. We will allow controlling them through
# our own "extra options" environment variables.
export ASAN_OPTIONS=""
export TSAN_OPTIONS=""
export LSAN_OPTIONS=""
export UBSAN_OPTIONS=""

local -r build_root_basename=${BUILD_ROOT##*/}

# We don't add a hyphen in the end of the following regex, because there is a "tsan_slow" build
Expand Down Expand Up @@ -982,20 +1003,21 @@ set_asan_tsan_runtime_options() {
if [[ $build_root_basename =~ ^asan- ]]; then
# Enable leak detection even under LLVM 3.4, where it was disabled by default.
# This flag only takes effect when running an ASAN build.
ASAN_OPTIONS="${ASAN_OPTIONS:-} detect_leaks=1"
export ASAN_OPTIONS
export ASAN_OPTIONS="detect_leaks=1"

# Set up suppressions for LeakSanitizer
LSAN_OPTIONS="${LSAN_OPTIONS:-} suppressions=$YB_SRC_ROOT/build-support/lsan-suppressions.txt"
LSAN_OPTIONS="suppressions=$YB_SRC_ROOT/build-support/lsan-suppressions.txt"

# If we print out object addresses somewhere else, we can match them to LSAN-reported
# addresses of leaked objects.
LSAN_OPTIONS+=" report_objects=1"
export LSAN_OPTIONS

# Enable stack traces for UBSAN failures
UBSAN_OPTIONS="${UBSAN_OPTIONS:-} print_stacktrace=1"
UBSAN_OPTIONS="${UBSAN_OPTIONS} suppressions=$YB_SRC_ROOT/build-support/ubsan-suppressions.txt"
UBSAN_OPTIONS="print_stacktrace=1"
local ubsan_suppressions_path=$YB_SRC_ROOT/build-support/ubsan-suppressions.txt
ensure_file_exists "$ubsan_suppressions_path"
UBSAN_OPTIONS+=" suppressions=$ubsan_suppressions_path"
export UBSAN_OPTIONS
fi

Expand All @@ -1008,12 +1030,18 @@ set_asan_tsan_runtime_options() {
# needs compiler-rt commits c4c3dfd, 9a8efe3, and possibly others.
# 2. Many unit tests report lock-order-inversion warnings; they should be
# fixed before reenabling the detector.
TSAN_OPTIONS="${TSAN_OPTIONS:-} detect_deadlocks=0"
TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$YB_SRC_ROOT/build-support/tsan-suppressions.txt"
TSAN_OPTIONS="$TSAN_OPTIONS history_size=7"
TSAN_OPTIONS="$TSAN_OPTIONS external_symbolizer_path=$ASAN_SYMBOLIZER_PATH"
TSAN_OPTIONS="detect_deadlocks=0"
TSAN_OPTIONS+=" suppressions=$YB_SRC_ROOT/build-support/tsan-suppressions.txt"
TSAN_OPTIONS+=" history_size=7"
TSAN_OPTIONS+=" external_symbolizer_path=$ASAN_SYMBOLIZER_PATH"
export TSAN_OPTIONS
fi

local extra_opts=${YB_SANITIZER_EXTRA_OPTIONS:-}
export ASAN_OPTIONS="$SANITIZER_COMMON_OPTIONS ${ASAN_OPTIONS:-} $extra_opts"
export LSAN_OPTIONS="$SANITIZER_COMMON_OPTIONS ${LSAN_OPTIONS:-} $extra_opts"
export TSAN_OPTIONS="$SANITIZER_COMMON_OPTIONS ${TSAN_OPTIONS:-} $extra_opts"
export UBSAN_OPTIONS="$SANITIZER_COMMON_OPTIONS ${UBSAN_OPTIONS:-} $extra_opts"
}

did_test_succeed() {
Expand Down Expand Up @@ -1045,7 +1073,7 @@ did_test_succeed() {
return 1
fi

if grep -q 'AddressSanitizer: undefined-behavior' "$log_path"; then
if grep -q '(AddressSanitizer|UndefinedBehaviorSanitizer): undefined-behavior' "$log_path"; then
log 'Detected undefined behavior'
return 1
fi
Expand Down
16 changes: 14 additions & 2 deletions build-support/gen_version_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,20 @@ def main():
}
content = json.dumps(data)

with file(output_path, "w") as f:
print >>f, content
# Frequently getting errors here when rebuilding on NFS:
# https://gist.githubusercontent.com/mbautin/572dc0ab6b9c269910c1a51f31d79b38/raw
attempts_left = 10
while attempts_left > 0:
try:
with file(output_path, "w") as f:
print >>f, content
break
except IOError, ex:
if attempts_left == 0:
raise ex
if 'Resource temporarily unavailable' in ex.message:
time.sleep(0.1)
attempts_left -= 1

return 0

Expand Down
2 changes: 1 addition & 1 deletion build-support/jenkins/build-and-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,7 @@ fi
# Run tests, either on Spark or locally.
# If YB_COMPILE_ONLY is set to 1, we skip running all tests (Java and C++).

set_asan_tsan_runtime_options
set_sanitizer_runtime_options

if [[ $YB_COMPILE_ONLY != "1" ]]; then
if spark_available; then
Expand Down
37 changes: 24 additions & 13 deletions build-support/run-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ if [[ $# -eq 2 && -d $YB_SRC_ROOT/java/$1 ]]; then
fi
set_common_test_paths
set_mvn_parameters
set_asan_tsan_runtime_options
set_sanitizer_runtime_options
mkdir -p "$YB_TEST_LOG_ROOT_DIR/java"

# This can't include $YB_TEST_INVOCATION_ID -- previously, when we did that, it looked like some
Expand All @@ -95,19 +95,30 @@ if [[ $# -eq 2 && -d $YB_SRC_ROOT/java/$1 ]]; then
surefire_rel_tmp_dir=surefire${timestamp}_${RANDOM}_${RANDOM}_${RANDOM}_$$

cd "$YB_SRC_ROOT/java"
set -x +e
# We specify tempDir to use a separate temporary directory for each test.
# http://maven.apache.org/surefire/maven-surefire-plugin/test-mojo.html
mvn -Dtest="$test_class" \
--projects "$module_name" \
--settings "$YB_MVN_SETTINGS_PATH" \
-DbinDir="$BUILD_ROOT/bin" \
-Dmaven.repo.local="$YB_MVN_LOCAL_REPO" \
-DtempDir="$surefire_rel_tmp_dir" \
-DskipAssembly \
-Dmaven.javadoc.skip \
-X \
surefire:test
mvn_options=(
-Dtest="$test_class"
--projects "$module_name"
-DbinDir="$BUILD_ROOT/bin"
-Dmaven.repo.local="$YB_MVN_LOCAL_REPO"
-DtempDir="$surefire_rel_tmp_dir"
-DskipAssembly
-Dmaven.javadoc.skip
)

if is_jenkins || \
[[ $YB_MVN_SETTINGS_PATH != "$HOME/.m2/settings.xml" ]] || \
[[ -f $YB_MVN_SETTINGS_PATH ]]; then
mvn_options+=( --settings "$YB_MVN_SETTINGS_PATH" )
fi

if is_jenkins; then
# When running on Jenkins we'd like to see more debug output.
mvn_options+=( -X )
fi

( set -x; mvn "${mvn_options[@]}" surefire:test )
# See the cleanup() function above for how we kill stuck processes based on the
# $YB_TEST_INVOCATION_ID pattern.
exit
Expand Down Expand Up @@ -165,7 +176,7 @@ TEST_NAME=${TEST_NAME_WITH_EXT%%.*}
TEST_DIR_BASENAME="$( basename "$TEST_DIR" )"
LOG_PATH_BASENAME_PREFIX=$TEST_NAME

set_asan_tsan_runtime_options
set_sanitizer_runtime_options

tests=()
rel_test_binary="$TEST_DIR_BASENAME/$TEST_NAME"
Expand Down
64 changes: 57 additions & 7 deletions build-support/ubsan-suppressions.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,59 @@
# False positive from libunwind
alignment:access_mem
# https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html#runtime-suppressions

# See ubsan_checks.inc from the UBSAN source for the list of possible UBSAN checks.
# The source code is at https://github.com/llvm-mirror/compiler-rt

# Here is a copy-and-paste for convenience:
# https://gist.githubusercontent.com/mbautin/fb08c1864783cfa410c16caf7c1b4833/raw

# It looks like in the suppression files we should use the last argument out of the three arguments
# to the UBSAN_CHECK macro. Here is a list for convenience.
#
# alignment
# bool
# bounds
# cfi
# enum
# float-cast-overflow
# float-divide-by-zero
# function
# integer-divide-by-zero
# invalid-builtin-use
# nonnull-attribute
# null
# object-size
# pointer-overflow
# return
# returns-nonnull-attribute
# shift-base
# shift-exponent
# signed-integer-overflow
# undefined
# unreachable
# unsigned-integer-overflow
# vla-bound
# vptr

# -------------------------------------------------------------------------------------------------
# UBSAN issues in the AWS C++ SDK.
# -------------------------------------------------------------------------------------------------

# TODO: verify if it is OK to ignore these.
# Tracked here: https://goo.gl/kPaUgS
signed-integer-overflow:Aws::Utils::HashingUtils::HashString

# Invalid bool load:
# https://gist.githubusercontent.com/mbautin/2d34b20f632b7efacac5541227579434/raw
# Happens in Aws::Utils::Outcome<Aws::Utils::Array<unsigned char>, bool>::operator=
#
# The error is:
# runtime error: load of value 156, which is not a valid value for type 'typename
# remove_reference<bool &>::type' (aka 'bool')
#
# Weirdly, even though 'bool' is mentioned above, we have to use 'enum' here.
enum:Aws::Utils::Outcome*operator=

# -------------------------------------------------------------------------------------------------

# crcutil works with unaligned data
alignment:crcutil::Crc32cSSE4::Crc32c
Expand All @@ -9,8 +63,4 @@ alignment:snappy::internal::CompressFragment
alignment:snappy::EmitLiteral
alignment:snappy::EmitCopyLessThan64
alignment:snappy::internal::CompressFragment

# TODO: verify if it is OK to ignore these.
# Tracked here: https://goo.gl/kPaUgS
signed-integer-overflow:Aws::Utils::HashingUtils::HashString
bool:Aws::Client::AWSAuthV4Signer::ComputeLongLivedHash
alignment:snappy::SnappyDecompressor::DecompressAllTags
18 changes: 17 additions & 1 deletion build-support/update_test_result_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
from xml.dom import minidom


MAX_RESULT_XML_SIZE_BYTES = 16 * 1024 * 1024


def main():
logging.basicConfig(
level=logging.INFO,
Expand Down Expand Up @@ -58,6 +61,13 @@ def main():
dest="mark_as_failed")

args = parser.parse_args()
result_xml_size = os.stat(args.result_xml).st_size
if result_xml_size > MAX_RESULT_XML_SIZE_BYTES:
logging.error(
"Result XML file size is more than max allowed size (%d bytes): %d bytes. "
"Refusing to parse this XML file and add the log path there.",
MAX_RESULT_XML_SIZE_BYTES, result_xml_size)
return False

# Basic JUnit XML format description:
#
Expand Down Expand Up @@ -126,6 +136,12 @@ def main():
with open(args.result_xml, 'w') as output_file:
output_file.write(output_xml_str)

return True


if __name__ == "__main__":
sys.exit(main())
if main():
exit_code = 0
else:
exit_code = 1
sys.exit(exit_code)
1 change: 1 addition & 0 deletions java/yb-client/src/test/java/org/yb/client/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ public interface Condition {
}

public static void waitFor(Condition condition, long timeoutMs) throws Exception {
timeoutMs *= getTimeoutMultiplier();
final long startTimeMs = System.currentTimeMillis();
while (System.currentTimeMillis() - startTimeMs < timeoutMs && !condition.get()) {
Thread.sleep(SLEEP_TIME_MS);
Expand Down
Loading

0 comments on commit 7147469

Please sign in to comment.