From 6d6c388caa7eeb24d1b125828683ac602815f7d0 Mon Sep 17 00:00:00 2001
From: Unai Sarasola
Date: Mon, 12 Mar 2018 12:06:40 +0100
Subject: [PATCH] [SPK-278] Fault tolerance: dispatcher drops down (#176)

* Renamed pnf to performance

* Working fault tolerance scenario 1

* Renamed per Pedro's suggestions, and removed the driver

* Incomplete. I must first solve the bug in the installation
---
 .../fault/FT_SSD_001_ExecutorDropDown_IT.java |  2 +-
 .../FT_SSD_002_DispatcherDropDown_IT.java     | 21 +++++
 .../pf/dispatcherAT/installation.feature      | 14 ++-
 .../pnf/fault/dispatcher-dropsdown.feature    | 94 +++++++++++++++++++
 testsAT/testng-fault.xml                      |  1 +
 5 files changed, 126 insertions(+), 6 deletions(-)
 create mode 100644 testsAT/src/test/java/com/stratio/pnf/fault/FT_SSD_002_DispatcherDropDown_IT.java
 create mode 100644 testsAT/src/test/resources/features/pnf/fault/dispatcher-dropsdown.feature

diff --git a/testsAT/src/test/java/com/stratio/pnf/fault/FT_SSD_001_ExecutorDropDown_IT.java b/testsAT/src/test/java/com/stratio/pnf/fault/FT_SSD_001_ExecutorDropDown_IT.java
index 3b4b5fd7b0e85..dcb53927587a5 100644
--- a/testsAT/src/test/java/com/stratio/pnf/fault/FT_SSD_001_ExecutorDropDown_IT.java
+++ b/testsAT/src/test/java/com/stratio/pnf/fault/FT_SSD_001_ExecutorDropDown_IT.java
@@ -15,7 +15,7 @@ public FT_SSD_001_ExecutorDropDown_IT() {
     }
 
     @Test(enabled = true, groups = {"FT_SSD_001_ExecutorDropDown"})
-    public void kafkaCoverage() throws Exception {
+    public void executorDropDownCoverage() throws Exception {
         new CucumberRunner(this.getClass()).runCukes();
     }
 }
diff --git a/testsAT/src/test/java/com/stratio/pnf/fault/FT_SSD_002_DispatcherDropDown_IT.java b/testsAT/src/test/java/com/stratio/pnf/fault/FT_SSD_002_DispatcherDropDown_IT.java
new file mode 100644
index 0000000000000..0a76fa480587b
--- /dev/null
+++ b/testsAT/src/test/java/com/stratio/pnf/fault/FT_SSD_002_DispatcherDropDown_IT.java
@@ -0,0 +1,21 @@
+
+package com.stratio.pnf.fault;
+
+import com.stratio.qa.cucumber.testng.CucumberRunner;
+import com.stratio.spark.tests.utils.BaseTest;
+import cucumber.api.CucumberOptions;
+import org.testng.annotations.Test;
+
+@CucumberOptions(features = {
+        "src/test/resources/features/pnf/fault/dispatcher-dropsdown.feature"
+})
+public class FT_SSD_002_DispatcherDropDown_IT extends BaseTest {
+
+    public FT_SSD_002_DispatcherDropDown_IT() {
+    }
+
+    @Test(enabled = true, groups = {"FT_SSD_002_DispatcherDropDown"})
+    public void dispatcherDropDownCoverage() throws Exception {
+        new CucumberRunner(this.getClass()).runCukes();
+    }
+}
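
Each *_IT runner above is a thin TestNG wrapper that points CucumberRunner at a single feature file, so each fault scenario can be launched in isolation through its TestNG group. A minimal sketch of such a targeted run, assuming standard Surefire/Failsafe group filtering and that the ${...} placeholders in the features resolve from -D system properties (both assumptions; the exact wiring lives in the testsAT pom and may differ):

    # Hypothetical invocation; property names and values are illustrative
    cd testsAT
    mvn verify -Dgroups=FT_SSD_002_DispatcherDropDown \
        -DSPARK_FW_NAME=spark-fw \
        -DDCOS_CLI_HOST=dcos-cli.demo.labs.stratio.com \
        -DCLUSTER_ID=demo
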
diff --git a/testsAT/src/test/resources/features/pf/dispatcherAT/installation.feature b/testsAT/src/test/resources/features/pf/dispatcherAT/installation.feature
index c5003715f7781..5e94e63a59a0c 100644
--- a/testsAT/src/test/resources/features/pf/dispatcherAT/installation.feature
+++ b/testsAT/src/test/resources/features/pf/dispatcherAT/installation.feature
@@ -8,18 +8,22 @@ Feature: [Install Spark Dispatcher] Installing Spark Dispatcher
 
     Given I create file 'SparkDispatcherInstallation.json' based on 'schemas/pf/SparkDispatcher/BasicSparkDispatcher.json' as 'json' with:
       | $.service.name | UPDATE | ${SPARK_FW_NAME} | n/a |
-  #Copy DEPLOY JSON to DCOS-CLI
+    #Copy DEPLOY JSON to DCOS-CLI
     When I outbound copy 'target/test-classes/SparkDispatcherInstallation.json' through a ssh connection to '/dcos'
-  #Start image from JSON
+
+    #Start image from JSON
     And I run 'dcos package describe --app --options=/dcos/SparkDispatcherInstallation.json spark-dispatcher > /dcos/SparkDispatcherInstallationMarathon.json' in the ssh connection
     And I run 'sed -i -e 's|"image":.*|"image": "${SPARK_DOCKER_IMAGE}:${STRATIO_SPARK_VERSION}",|g' /dcos/SparkDispatcherInstallationMarathon.json' in the ssh connection
     And I run 'dcos marathon app add /dcos/SparkDispatcherInstallationMarathon.json' in the ssh connection
-  #Check Spark-fw is Running
+
+    #Check Spark-fw is Running
     Then in less than '500' seconds, checking each '20' seconds, the command output 'dcos task | grep "${SPARK_FW_NAME}\." | grep R | wc -l' contains '1'
-  #Find task-id if from DCOS-CLI
+
+    #Find the task id from DCOS-CLI
     And in less than '300' seconds, checking each '20' seconds, the command output 'dcos marathon task list ${SPARK_FW_NAME} | grep ${SPARK_FW_NAME} | awk '{print $2}'' contains 'True'
     And I run 'dcos marathon task list ${SPARK_FW_NAME} | awk '{print $5}' | grep ${SPARK_FW_NAME} | head -n 1' in the ssh connection and save the value in environment variable 'sparkTaskId'
-  #DCOS dcos marathon task show check healtcheck status
+
+    #Check the healthcheck status via 'dcos marathon task show'
     Then in less than '300' seconds, checking each '10' seconds, the command output 'dcos marathon task show !{sparkTaskId} | grep TASK_RUNNING | wc -l' contains '1'
     Then in less than '300' seconds, checking each '10' seconds, the command output 'dcos marathon task show !{sparkTaskId} | grep healthCheckResults | wc -l' contains '1'
     Then in less than '300' seconds, checking each '10' seconds, the command output 'dcos marathon task show !{sparkTaskId} | grep '"alive": true' | wc -l' contains '1'
\ No newline at end of file
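
For reference, the shell flow these installation steps drive boils down to the sketch below; the commands are taken verbatim from the steps, with an illustrative image tag standing in for ${SPARK_DOCKER_IMAGE}:${STRATIO_SPARK_VERSION}:

    # Render the marathon app definition from the package options
    dcos package describe --app --options=/dcos/SparkDispatcherInstallation.json spark-dispatcher \
        > /dcos/SparkDispatcherInstallationMarathon.json
    # Pin the dispatcher docker image to the version under test (illustrative tag)
    sed -i -e 's|"image":.*|"image": "qa.stratio.com/stratio/spark:2.2.0.5",|g' \
        /dcos/SparkDispatcherInstallationMarathon.json
    # Deploy, then poll until exactly one running task matches the framework name
    dcos marathon app add /dcos/SparkDispatcherInstallationMarathon.json
    dcos task | grep "spark-fw\." | grep R | wc -l   # expect 1
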
diff --git a/testsAT/src/test/resources/features/pnf/fault/dispatcher-dropsdown.feature b/testsAT/src/test/resources/features/pnf/fault/dispatcher-dropsdown.feature
new file mode 100644
index 0000000000000..65b4603498a54
--- /dev/null
+++ b/testsAT/src/test/resources/features/pnf/fault/dispatcher-dropsdown.feature
@@ -0,0 +1,94 @@
+@rest
+Feature: [Stability test] Dispatcher Dropdowns
+
+  Background:
+
+    Given I open a ssh connection to '${DCOS_CLI_HOST}' with user 'root' and password 'stratio'
+    #Check the dispatcher and spark-coverage are deployed
+    Then in less than '20' seconds, checking each '10' seconds, the command output 'dcos task | grep "${SPARK_FW_NAME}\." | grep R | wc -l' contains '1'
+    Then in less than '20' seconds, checking each '10' seconds, the command output 'dcos task | grep spark-coverage | grep R | wc -l' contains '1'
+
+    #Obtain the Mesos master
+    Given I open a ssh connection to '${DCOS_IP}' with user 'root' and password 'stratio'
+    Given I run 'getent hosts leader.mesos | awk '{print $1}'' in the ssh connection and save the value in environment variable 'MESOS_MASTER'
+
+    #Clean all the drivers from the spark-dispatcher
+    Then I clean all the drivers in the dispatcher with name '${SPARK_FW_NAME}' in dcos host '${CLUSTER_ID}.labs.stratio.com' with mesos master '!{MESOS_MASTER}:5050' with user 'admin' and password '1234'
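+
+    # For reference, the Background boils down to this shell flow (names such as
+    # 'spark-fw' are illustrative stand-ins for ${SPARK_FW_NAME} and !{MESOS_MASTER}):
+    #   MESOS_MASTER=$(getent hosts leader.mesos | awk '{print $1}')
+    #   curl -s "$MESOS_MASTER:5050/frameworks" \
+    #     | jq '.frameworks[] | select(.name == "spark-fw") | .tasks[].id'
+    # Any driver still registered against the dispatcher is killed before each scenario.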
+
+  Scenario: [Dispatcher Dropdowns][01] Launch a Kafka job, delete the dispatcher, launch a new dispatcher, and check that it retrieves the running drivers
+
+    #Now launch the job
+    Given I set sso token using host '${CLUSTER_ID}.labs.stratio.com' with user 'admin' and password '1234'
+    And I securely send requests to '${CLUSTER_ID}.labs.stratio.com:443'
+
+    When I send a 'POST' request to '/service/${SPARK_FW_NAME}/v1/submissions/create' based on 'schemas/pf/SparkCoverage/kafka_curl.json' as 'json' with:
+      | $.appResource | UPDATE | http://spark-coverage.marathon.mesos:9000/jobs/kafka-${COVERAGE_VERSION}.jar | n/a |
+      | $.sparkProperties['spark.jars'] | UPDATE | http://spark-coverage.marathon.mesos:9000/jobs/kafka-${COVERAGE_VERSION}.jar | n/a |
+      | $.sparkProperties['spark.mesos.executor.docker.image'] | UPDATE | ${SPARK_DOCKER_IMAGE}:${STRATIO_SPARK_VERSION} | n/a |
+      | $.appArgs[0] | UPDATE | gosec1.node.paas.labs.stratio.com:9092 | n/a |
+
+    Then the service response status must be '200' and its response must contain the text '"success" : true'
+
+    #Save the id of the launched driver
+    Then I save the value from field in service response 'submissionId' in variable 'driverKafka'
+
+    #Check the driver starts
+    Given I open a ssh connection to '${DCOS_CLI_HOST}' with user 'root' and password 'stratio'
+    Then in less than '200' seconds, checking each '10' seconds, the command output 'dcos task log !{driverKafka} stdout --lines=1000 | grep "###"' contains '###'
+
+    #Now kill the running dispatcher
+    Then I open a ssh connection to '${DCOS_CLI_HOST}' with user 'root' and password 'stratio'
+    Then I run 'dcos marathon app remove ${SPARK_FW_NAME}' in the ssh connection
+
+    #Check the dispatcher has been killed
+    Then in less than '150' seconds, checking each '10' seconds, the command output 'dcos task | grep "${SPARK_FW_NAME}\." | wc -l' contains '0'
+    Then in less than '150' seconds, checking each '10' seconds, the command output 'dcos marathon task list ${SPARK_FW_NAME} | wc -l' contains '0'
+
+    #Now we deploy another dispatcher (copied from installation.feature... TODO: find a better way to share these steps)
+    Given I create file 'SparkDispatcherInstallation.json' based on 'schemas/pf/SparkDispatcher/BasicSparkDispatcher.json' as 'json' with:
+      | $.service.name | UPDATE | ${SPARK_FW_NAME} | n/a |
+
+    #Copy DEPLOY JSON to DCOS-CLI
+    When I outbound copy 'target/test-classes/SparkDispatcherInstallation.json' through a ssh connection to '/dcos'
+
+    #Start image from JSON
+    And I run 'dcos package describe --app --options=/dcos/SparkDispatcherInstallation.json spark-dispatcher > /dcos/SparkDispatcherInstallationMarathon.json' in the ssh connection
+    And I run 'sed -i -e 's|"image":.*|"image": "${SPARK_DOCKER_IMAGE}:${STRATIO_SPARK_VERSION}",|g' /dcos/SparkDispatcherInstallationMarathon.json' in the ssh connection
+    And I run 'dcos marathon app add /dcos/SparkDispatcherInstallationMarathon.json' in the ssh connection
+
+    #Check Spark-fw is Running
+    Then in less than '500' seconds, checking each '20' seconds, the command output 'dcos task | grep "${SPARK_FW_NAME}\." | grep R | wc -l' contains '1'
+
+    #Find the task id from DCOS-CLI
+    And in less than '300' seconds, checking each '20' seconds, the command output 'dcos marathon task list ${SPARK_FW_NAME} | grep ${SPARK_FW_NAME} | awk '{print $2}'' contains 'True'
+    And I run 'dcos marathon task list ${SPARK_FW_NAME} | awk '{print $5}' | grep ${SPARK_FW_NAME} | head -n 1' in the ssh connection and save the value in environment variable 'sparkTaskId'
+
+    #Check the healthcheck status via 'dcos marathon task show'
+    Then in less than '300' seconds, checking each '10' seconds, the command output 'dcos marathon task show !{sparkTaskId} | grep TASK_RUNNING | wc -l' contains '1'
+    Then in less than '300' seconds, checking each '10' seconds, the command output 'dcos marathon task show !{sparkTaskId} | grep healthCheckResults | wc -l' contains '1'
+    Then in less than '300' seconds, checking each '10' seconds, the command output 'dcos marathon task show !{sparkTaskId} | grep '"alive": true' | wc -l' contains '1'
+
+
+    #Now check that the new dispatcher has adopted the previous driver as a child
+    Then in less than '200' seconds, checking each '10' seconds, the command output 'curl -s !{MESOS_MASTER}:5050/frameworks | jq '.frameworks[] | select(.name == "${SPARK_FW_NAME}") | .tasks | map(select(.name | contains ("AT-kafka"))) | map(select(.id == "!{driverKafka}")) | .[] | .state' | grep "TASK_RUNNING" | wc -l' contains '1'
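+
+    # The check above unpacks to the following (illustrative names in place of
+    # ${SPARK_FW_NAME} and !{driverKafka}):
+    #   curl -s $MESOS_MASTER:5050/frameworks \
+    #     | jq '.frameworks[] | select(.name == "spark-fw")
+    #           | .tasks | map(select(.name | contains("AT-kafka")))
+    #           | map(select(.id == "driver-...")) | .[] | .state'
+    # A state of TASK_RUNNING under the re-deployed dispatcher proves it re-attached
+    # to a driver it did not launch itself.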
+
+    #Check the driver keeps working correctly, independently of the dispatcher
+    Given I open a ssh connection to '${DCOS_CLI_HOST}' with user 'root' and password 'stratio'
+    Then I run 'dcos task log !{driverKafka} stdout --lines=1000000 | grep "###" | wc -l' in the ssh connection and save the value in environment variable 'PREVIOUS_WINDOW'
+    Then in less than '100' seconds, checking each '10' seconds, the command output 'if [ $(dcos task log !{driverKafka} stdout --lines=1000000 | grep "###" | wc -l) -gt "!{PREVIOUS_WINDOW}" ]; then echo "true"; fi' contains 'true'
+
+    #Now kill the Kafka driver
+    #(We send a JSON body because the cucumber step doesn't support empty POST submissions)
+    Then I set sso token using host '${CLUSTER_ID}.labs.stratio.com' with user 'admin' and password '1234'
+    Then I securely send requests to '${CLUSTER_ID}.labs.stratio.com:443'
+    Then I send a 'POST' request to '/service/${SPARK_FW_NAME}/v1/submissions/kill/!{driverKafka}' based on 'schemas/pf/SparkCoverage/kafka_curl.json' as 'json' with:
+      | $.appResource | UPDATE | http://spark-coverage.marathon.mesos:9000/jobs/kafka-${COVERAGE_VERSION}.jar | n/a |
+
+    Then the service response status must be '200' and its response must contain the text '"success" : true'
+
+    #Check the exit is clean
+    Then in less than '200' seconds, checking each '10' seconds, the command output 'curl -s !{MESOS_MASTER}:5050/frameworks | jq '.frameworks[] | select(.name == "${SPARK_FW_NAME}") | .completed_tasks | map(select(.name | contains ("AT-kafka"))) | map(select(.id == "!{driverKafka}")) | .[] | .state' | grep "TASK_KILLED" | wc -l' contains '1'
+
+    Then in less than '10' seconds, checking each '5' seconds, the command output 'curl -s !{MESOS_MASTER}:5050/frameworks | jq '.frameworks[] | select(.name == "${SPARK_FW_NAME}") | .completed_tasks | map(select(.name | contains ("AT-kafka"))) | map(select(.id == "!{driverKafka}")) | .[] | .statuses' | grep "TASK_RUNNING" | wc -l' contains '1'
+    Then in less than '10' seconds, checking each '5' seconds, the command output 'curl -s !{MESOS_MASTER}:5050/frameworks | jq '.frameworks[] | select(.name == "${SPARK_FW_NAME}") | .completed_tasks | map(select(.name | contains ("AT-kafka"))) | map(select(.id == "!{driverKafka}")) | .[] | .statuses' | grep "TASK_FAILED" | wc -l' contains '0'
+    Then in less than '10' seconds, checking each '5' seconds, the command output 'curl -s !{MESOS_MASTER}:5050/frameworks | jq '.frameworks[] | select(.name == "${SPARK_FW_NAME}") | .completed_tasks | map(select(.name | contains ("AT-kafka"))) | map(select(.id == "!{driverKafka}")) | .[] | .statuses' | grep "TASK_KILLED" | wc -l' contains '1'
\ No newline at end of file
diff --git a/testsAT/testng-fault.xml b/testsAT/testng-fault.xml
index 511edc69959b8..0e22f21a897c0 100644
--- a/testsAT/testng-fault.xml
+++ b/testsAT/testng-fault.xml
@@ -7,6 +7,7 @@
+            <class name="com.stratio.pnf.fault.FT_SSD_002_DispatcherDropDown_IT" />
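
With FT_SSD_002_DispatcherDropDown_IT registered in testng-fault.xml, the whole fault-tolerance suite can be exercised from testsAT. A hedged sketch, assuming Surefire/Failsafe reads the suite file through its standard suiteXmlFiles property (the pom may wire this differently):

    # Hypothetical invocation: run every test declared in testng-fault.xml
    cd testsAT
    mvn verify -Dsurefire.suiteXmlFiles=testng-fault.xml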