Miscellaneous test improvements, and better diagnostics before a fata…

…l error in MvccManager::AddPending Summary: - Better diagnostics before a fatal error in MvccManager::AddPending> - Print fully symbolized test failure stack trace to stderr before crashing for better test debugging. - Do some regex replacements on symbolized stack trace lines for better readability. - Post-process symbolized function names to make them more readable. - Update BaseCQLTest to wait until `system.peers` contains enough entries before proceeding. - Better failure reporting in TestRF1Cluster. - Suppress UBSAN warnings in the AWS C++ client. - Provide a way to customize sanitizer options for tests. - Add retries to gen_version_info.py (useful when getting "Resource temporarily unavailable" on NFS). - Do not fail if `~/.m2/settings.xml` is not present (unless we're running on Jenkins). - `update_test_result_xml.py` should not try to parse XML files more than 16 MB in size. - The `waitFor` function in Java tests now automatically applies the TSAN timeout multiplier. - Print sanitizer-related environment variables at the top of the test log. - Pass minicluster daemon id to yb-master and yb-tserver in tests. - Minor improvements in MonoTime::ToString(). - Ability to run Java tests in `yb_build.sh`. Test Plan: Jenkins Reviewers: pritam.damania, hector, bharat, timur, sergei Reviewed By: sergei Subscribers: bharat, ybase Differential Revision: https://phabricator.dev.yugabyte.com/D4307
yugabyte · Mar 13, 2018 · 7147469 · 7147469
1 parent 41a2c40
commit 7147469
Show file tree

Hide file tree

Showing 29 changed files with 582 additions and 240 deletions.
diff --git a/bin/repeat_unit_test.sh b/bin/repeat_unit_test.sh
@@ -45,7 +45,7 @@ Options:
     be allowed.
   --skip-log-compression
     Don't compress kept logs.
-  --stop-at-failure
+  --stop-at-failure, --stop-on-failure
     Stop running further iterations after the first failure happens.
 EOT
 }
@@ -146,10 +146,13 @@ while [[ $# -gt 0 ]]; do
       fi
       yb_compiler_type_arg="clang"
     ;;
-    --stop-at-failure)
+    --stop-at-failure|--stop-on-failure)
       stop_at_failure=true
     ;;
     *)
+      if [[ $1 == "--" ]]; then
+        fatal "'--' is not a valid positional argument, something is wrong."
+      fi
       positional_args+=( "$1" )
     ;;
   esac
@@ -163,7 +166,7 @@ fi
 
 set_cmake_build_type_and_compiler_type
 set_build_root
-set_asan_tsan_runtime_options
+set_sanitizer_runtime_options
 
 declare -i -r num_pos_args=${#positional_args[@]}
 if [[ $num_pos_args -lt 1 || $num_pos_args -gt 2 ]]; then
@@ -277,6 +280,10 @@ $RANDOM.$RANDOM.$RANDOM.$$
   fi
 
   echo "$pass_or_fail: iteration $iteration, $elapsed_time_sec sec$comment"
+
+  if [[ $pass_or_fail == "FAILED" ]]; then
+    touch "$failure_flag_file_path"
+  fi
 else
   if [[ -n $yb_compiler_type_from_env ]]; then
     log "YB_COMPILER_TYPE env variable was set to '$yb_compiler_type_from_env' by the caller."

diff --git a/build-support/common-test-env.sh b/build-support/common-test-env.sh
@@ -56,7 +56,8 @@ readonly RELEVANT_LOG_LINES_RE
 # Some functions use this to output to stdout/stderr along with a file.
 append_output_to=/dev/null
 
-readonly ADDRESS_ALREADY_IN_USE_PATTERN="Address already in use"
+readonly TEST_RESTART_PATTERN="Address already in use|\
+pthread .*: Device or resource busy"
 
 # We use this to submit test jobs for execution on Spark.
 readonly SPARK_SUBMIT_CMD_PATH_NON_TSAN=/n/tools/spark/current/bin/spark-submit
@@ -76,6 +77,9 @@ readonly LIST_OF_TESTS_DIR_NAME="list_of_tests"
 
 readonly JENKINS_NFS_BUILD_REPORT_BASE_DIR="/n/jenkins/build_stats"
 
+# https://github.com/google/sanitizers/wiki/SanitizerCommonFlags
+readonly SANITIZER_COMMON_OPTIONS=""
+
 # -------------------------------------------------------------------------------------------------
 # Functions
 # -------------------------------------------------------------------------------------------------
@@ -239,6 +243,12 @@ collect_gtest_tests() {
     gtest_list_tests_cmd+=( "--gtest_filter=$YB_GTEST_FILTER" )
   fi
 
+  # We need to list the tests without any user-specified sanitizer options, because that might
+  # produce unintended output.
+  local old_sanitizer_extra_options=${YB_SANITIZER_EXTRA_OPTIONS:-}
+  local YB_SANITIZER_EXTRA_OPTIONS=""
+  set_sanitizer_runtime_options
+
   set +e
   pushd "$gtest_list_tests_tmp_dir" >/dev/null
   local gtest_list_tests_result  # this has to be on a separate line to capture the exit code
@@ -249,6 +259,10 @@ collect_gtest_tests() {
   popd >/dev/null
   set -e
 
+  YB_SANITIZER_EXTRA_OPTIONS=$old_sanitizer_extra_options
+  set_sanitizer_runtime_options
+
+
   set_expected_core_dir "$gtest_list_tests_tmp_dir"
   test_log_path="$gtest_list_stderr_path"
   process_core_file
@@ -853,9 +867,9 @@ run_one_test() {
     # See if the test failed due to "Address already in use" and log a message if we still have more
     # attempts left.
     if [[ $attempts_left -gt 0 ]] && \
-       egrep -q "$ADDRESS_ALREADY_IN_USE_PATTERN" "$test_log_path"; then
-      log "Found 'Address already in use' in test log, re-running the test:"
-      egrep "$ADDRESS_ALREADY_IN_USE_PATTERN" "$test_log_path" >&2
+       egrep -q "$TEST_RESTART_PATTERN" "$test_log_path"; then
+      log "Found one of the intermittent error patterns in the log, restarting the test (once):"
+      egrep "$TEST_RESTART_PATTERN" "$test_log_path" >&2
     elif [[ $test_exit_code -ne 0 ]]; then
       # Avoid retrying any other kinds of failures.
       break
@@ -934,9 +948,16 @@ run_test_and_process_results() {
   delete_successful_output_if_needed
 }
 
-set_asan_tsan_runtime_options() {
+set_sanitizer_runtime_options() {
   expect_vars_to_be_set BUILD_ROOT
 
+  # Don't allow setting these options directly from outside. We will allow controlling them through
+  # our own "extra options" environment variables.
+  export ASAN_OPTIONS=""
+  export TSAN_OPTIONS=""
+  export LSAN_OPTIONS=""
+  export UBSAN_OPTIONS=""
+
   local -r build_root_basename=${BUILD_ROOT##*/}
 
   # We don't add a hyphen in the end of the following regex, because there is a "tsan_slow" build
@@ -982,20 +1003,21 @@ set_asan_tsan_runtime_options() {
   if [[ $build_root_basename =~ ^asan- ]]; then
     # Enable leak detection even under LLVM 3.4, where it was disabled by default.
     # This flag only takes effect when running an ASAN build.
-    ASAN_OPTIONS="${ASAN_OPTIONS:-} detect_leaks=1"
-    export ASAN_OPTIONS
+    export ASAN_OPTIONS="detect_leaks=1"
 
     # Set up suppressions for LeakSanitizer
-    LSAN_OPTIONS="${LSAN_OPTIONS:-} suppressions=$YB_SRC_ROOT/build-support/lsan-suppressions.txt"
+    LSAN_OPTIONS="suppressions=$YB_SRC_ROOT/build-support/lsan-suppressions.txt"
 
     # If we print out object addresses somewhere else, we can match them to LSAN-reported
     # addresses of leaked objects.
     LSAN_OPTIONS+=" report_objects=1"
     export LSAN_OPTIONS
 
     # Enable stack traces for UBSAN failures
-    UBSAN_OPTIONS="${UBSAN_OPTIONS:-} print_stacktrace=1"
-    UBSAN_OPTIONS="${UBSAN_OPTIONS} suppressions=$YB_SRC_ROOT/build-support/ubsan-suppressions.txt"
+    UBSAN_OPTIONS="print_stacktrace=1"
+    local ubsan_suppressions_path=$YB_SRC_ROOT/build-support/ubsan-suppressions.txt
+    ensure_file_exists "$ubsan_suppressions_path"
+    UBSAN_OPTIONS+=" suppressions=$ubsan_suppressions_path"
     export UBSAN_OPTIONS
   fi
 
@@ -1008,12 +1030,18 @@ set_asan_tsan_runtime_options() {
     #    needs compiler-rt commits c4c3dfd, 9a8efe3, and possibly others.
     # 2. Many unit tests report lock-order-inversion warnings; they should be
     #    fixed before reenabling the detector.
-    TSAN_OPTIONS="${TSAN_OPTIONS:-} detect_deadlocks=0"
-    TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$YB_SRC_ROOT/build-support/tsan-suppressions.txt"
-    TSAN_OPTIONS="$TSAN_OPTIONS history_size=7"
-    TSAN_OPTIONS="$TSAN_OPTIONS external_symbolizer_path=$ASAN_SYMBOLIZER_PATH"
+    TSAN_OPTIONS="detect_deadlocks=0"
+    TSAN_OPTIONS+=" suppressions=$YB_SRC_ROOT/build-support/tsan-suppressions.txt"
+    TSAN_OPTIONS+=" history_size=7"
+    TSAN_OPTIONS+=" external_symbolizer_path=$ASAN_SYMBOLIZER_PATH"
     export TSAN_OPTIONS
   fi
+
+  local extra_opts=${YB_SANITIZER_EXTRA_OPTIONS:-}
+  export ASAN_OPTIONS="$SANITIZER_COMMON_OPTIONS ${ASAN_OPTIONS:-} $extra_opts"
+  export LSAN_OPTIONS="$SANITIZER_COMMON_OPTIONS ${LSAN_OPTIONS:-} $extra_opts"
+  export TSAN_OPTIONS="$SANITIZER_COMMON_OPTIONS ${TSAN_OPTIONS:-} $extra_opts"
+  export UBSAN_OPTIONS="$SANITIZER_COMMON_OPTIONS ${UBSAN_OPTIONS:-} $extra_opts"
 }
 
 did_test_succeed() {
@@ -1045,7 +1073,7 @@ did_test_succeed() {
     return 1
   fi
 
-  if grep -q 'AddressSanitizer: undefined-behavior' "$log_path"; then
+  if grep -q '(AddressSanitizer|UndefinedBehaviorSanitizer): undefined-behavior' "$log_path"; then
     log 'Detected undefined behavior'
     return 1
   fi

diff --git a/build-support/gen_version_info.py b/build-support/gen_version_info.py
@@ -118,8 +118,20 @@ def main():
             }
     content = json.dumps(data)
 
-    with file(output_path, "w") as f:
-        print >>f, content
+    # Frequently getting errors here when rebuilding on NFS:
+    # https://gist.githubusercontent.com/mbautin/572dc0ab6b9c269910c1a51f31d79b38/raw
+    attempts_left = 10
+    while attempts_left > 0:
+        try:
+            with file(output_path, "w") as f:
+                print >>f, content
+            break
+        except IOError, ex:
+            if attempts_left == 0:
+                raise ex
+            if 'Resource temporarily unavailable' in ex.message:
+                time.sleep(0.1)
+        attempts_left -= 1
 
     return 0
 

diff --git a/build-support/jenkins/build-and-test.sh b/build-support/jenkins/build-and-test.sh
@@ -535,7 +535,7 @@ fi
 # Run tests, either on Spark or locally.
 # If YB_COMPILE_ONLY is set to 1, we skip running all tests (Java and C++).
 
-set_asan_tsan_runtime_options
+set_sanitizer_runtime_options
 
 if [[ $YB_COMPILE_ONLY != "1" ]]; then
   if spark_available; then

diff --git a/build-support/run-test.sh b/build-support/run-test.sh
@@ -85,7 +85,7 @@ if [[ $# -eq 2 && -d $YB_SRC_ROOT/java/$1 ]]; then
   fi
   set_common_test_paths
   set_mvn_parameters
-  set_asan_tsan_runtime_options
+  set_sanitizer_runtime_options
   mkdir -p "$YB_TEST_LOG_ROOT_DIR/java"
 
   # This can't include $YB_TEST_INVOCATION_ID -- previously, when we did that, it looked like some
@@ -95,19 +95,30 @@ if [[ $# -eq 2 && -d $YB_SRC_ROOT/java/$1 ]]; then
   surefire_rel_tmp_dir=surefire${timestamp}_${RANDOM}_${RANDOM}_${RANDOM}_$$
 
   cd "$YB_SRC_ROOT/java"
-  set -x +e
   # We specify tempDir to use a separate temporary directory for each test.
   # http://maven.apache.org/surefire/maven-surefire-plugin/test-mojo.html
-  mvn -Dtest="$test_class" \
-    --projects "$module_name" \
-    --settings "$YB_MVN_SETTINGS_PATH" \
-    -DbinDir="$BUILD_ROOT/bin" \
-    -Dmaven.repo.local="$YB_MVN_LOCAL_REPO" \
-    -DtempDir="$surefire_rel_tmp_dir" \
-    -DskipAssembly \
-    -Dmaven.javadoc.skip \
-    -X \
-    surefire:test
+  mvn_options=(
+    -Dtest="$test_class"
+    --projects "$module_name"
+    -DbinDir="$BUILD_ROOT/bin"
+    -Dmaven.repo.local="$YB_MVN_LOCAL_REPO"
+    -DtempDir="$surefire_rel_tmp_dir"
+    -DskipAssembly
+    -Dmaven.javadoc.skip
+  )
+
+  if is_jenkins || \
+     [[ $YB_MVN_SETTINGS_PATH != "$HOME/.m2/settings.xml" ]] || \
+     [[ -f $YB_MVN_SETTINGS_PATH ]]; then
+    mvn_options+=( --settings "$YB_MVN_SETTINGS_PATH" )
+  fi
+
+  if is_jenkins; then
+    # When running on Jenkins we'd like to see more debug output.
+    mvn_options+=( -X )
+  fi
+
+  ( set -x; mvn "${mvn_options[@]}" surefire:test )
   # See the cleanup() function above for how we kill stuck processes based on the
   # $YB_TEST_INVOCATION_ID pattern.
   exit
@@ -165,7 +176,7 @@ TEST_NAME=${TEST_NAME_WITH_EXT%%.*}
 TEST_DIR_BASENAME="$( basename "$TEST_DIR" )"
 LOG_PATH_BASENAME_PREFIX=$TEST_NAME
 
-set_asan_tsan_runtime_options
+set_sanitizer_runtime_options
 
 tests=()
 rel_test_binary="$TEST_DIR_BASENAME/$TEST_NAME"

diff --git a/build-support/ubsan-suppressions.txt b/build-support/ubsan-suppressions.txt
@@ -1,5 +1,59 @@
-# False positive from libunwind
-alignment:access_mem
+# https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html#runtime-suppressions
+
+# See ubsan_checks.inc from the UBSAN source for the list of possible UBSAN checks.
+# The source code is at https://github.com/llvm-mirror/compiler-rt
+
+# Here is a copy-and-paste for convenience:
+# https://gist.githubusercontent.com/mbautin/fb08c1864783cfa410c16caf7c1b4833/raw
+
+# It looks like in the suppression files we should use the last argument out of the three  arguments
+# to the UBSAN_CHECK macro. Here is a list for convenience.
+#
+# alignment
+# bool
+# bounds
+# cfi
+# enum
+# float-cast-overflow
+# float-divide-by-zero
+# function
+# integer-divide-by-zero
+# invalid-builtin-use
+# nonnull-attribute
+# null
+# object-size
+# pointer-overflow
+# return
+# returns-nonnull-attribute
+# shift-base
+# shift-exponent
+# signed-integer-overflow
+# undefined
+# unreachable
+# unsigned-integer-overflow
+# vla-bound
+# vptr
+
+# -------------------------------------------------------------------------------------------------
+# UBSAN issues in the AWS C++ SDK.
+# -------------------------------------------------------------------------------------------------
+
+# TODO: verify if it is OK to ignore these.
+# Tracked here: https://goo.gl/kPaUgS
+signed-integer-overflow:Aws::Utils::HashingUtils::HashString
+
+# Invalid bool load:
+# https://gist.githubusercontent.com/mbautin/2d34b20f632b7efacac5541227579434/raw
+# Happens in Aws::Utils::Outcome<Aws::Utils::Array<unsigned char>, bool>::operator=
+#
+# The error is:
+# runtime error: load of value 156, which is not a valid value for type 'typename
+# remove_reference<bool &>::type' (aka 'bool')
+#
+# Weirdly, even though 'bool' is mentioned above, we have to use 'enum' here.
+enum:Aws::Utils::Outcome*operator=
+
+# -------------------------------------------------------------------------------------------------
 
 # crcutil works with unaligned data
 alignment:crcutil::Crc32cSSE4::Crc32c
@@ -9,8 +63,4 @@ alignment:snappy::internal::CompressFragment
 alignment:snappy::EmitLiteral
 alignment:snappy::EmitCopyLessThan64
 alignment:snappy::internal::CompressFragment
-
-# TODO: verify if it is OK to ignore these.
-# Tracked here: https://goo.gl/kPaUgS
-signed-integer-overflow:Aws::Utils::HashingUtils::HashString
-bool:Aws::Client::AWSAuthV4Signer::ComputeLongLivedHash
+alignment:snappy::SnappyDecompressor::DecompressAllTags
diff --git a/build-support/update_test_result_xml.py b/build-support/update_test_result_xml.py
@@ -21,6 +21,9 @@
 from xml.dom import minidom
 
 
+MAX_RESULT_XML_SIZE_BYTES = 16 * 1024 * 1024
+
+
 def main():
     logging.basicConfig(
         level=logging.INFO,
@@ -58,6 +61,13 @@ def main():
         dest="mark_as_failed")
 
     args = parser.parse_args()
+    result_xml_size = os.stat(args.result_xml).st_size
+    if result_xml_size > MAX_RESULT_XML_SIZE_BYTES:
+        logging.error(
+            "Result XML file size is more than max allowed size (%d bytes): %d bytes. "
+            "Refusing to parse this XML file and add the log path there.",
+            MAX_RESULT_XML_SIZE_BYTES, result_xml_size)
+        return False
 
     # Basic JUnit XML format description:
     #
@@ -126,6 +136,12 @@ def main():
     with open(args.result_xml, 'w') as output_file:
         output_file.write(output_xml_str)
 
+    return True
+
 
 if __name__ == "__main__":
-    sys.exit(main())
+    if main():
+        exit_code = 0
+    else:
+        exit_code = 1
+    sys.exit(exit_code)
diff --git a/java/yb-client/src/test/java/org/yb/client/TestUtils.java b/java/yb-client/src/test/java/org/yb/client/TestUtils.java
@@ -402,6 +402,7 @@ public interface Condition {
   }
 
   public static void waitFor(Condition condition, long timeoutMs) throws Exception {
+    timeoutMs *= getTimeoutMultiplier();
     final long startTimeMs = System.currentTimeMillis();
     while (System.currentTimeMillis() - startTimeMs < timeoutMs && !condition.get()) {
       Thread.sleep(SLEEP_TIME_MS);