Skip to content

Commit

Permalink
#5703: #5791: #5792 [YW] Releasing OnPrem nodes should not delete the…
Browse files Browse the repository at this point in the history
…m from universe metadata

Summary:
1. This diff aligns the OnPrem release node + re-add node flow to VM universes - releasing a node should
not clear the universe metaedata for that node. To re-add the node, use the "Node -> Actions -> Add"
flow instead of the "Edit Universe" flow.

2. Fixes bug introduced in D8405

3. Fixes bug introduced in https://phabricator.dev.yugabyte.com/D9097 - #5791 - releasing a node instance should only attempt to shut down processes on that node.

4. During an add node, we were not running the selection logic in NodeManager::pickNodes to select nodes from the onprem provider.

Test Plan:
1. Create an onprem universe, release node, verify that an entry still remains in the universe nodes page. Verify that the node shows up as not in use on the provider node instances page. Verify that data is deleted from a node when it is released.

2. Verify that a newly created universe can pick up a released node.

3. Verify that a newly created universe can pick up a combination of new nodes and released nodes.

Reviewers: arnav, daniel, ram, wesley

Reviewed By: ram

Subscribers: kannan, jenkins-bot, yugaware

Differential Revision: https://phabricator.dev.yugabyte.com/D8825
  • Loading branch information
WesleyW authored and iSignal committed Sep 23, 2020
1 parent 1cd68df commit fcb3607
Show file tree
Hide file tree
Showing 9 changed files with 76 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ clean_data_paths() {
for (( i=0; i<NUM_MOUNTS; ++i ))
do
rm -rf "${MOUNT_PATHS[i]}"/yb-data/$daemon
rm -rf "${MOUNT_PATHS[i]}"/pg_data
done

print_err_out "Cleaning core files on `hostname`"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,28 @@

package com.yugabyte.yw.commissioner.tasks;

import com.yugabyte.yw.commissioner.Common.CloudType;
import com.yugabyte.yw.commissioner.SubTaskGroupQueue;
import com.yugabyte.yw.commissioner.UserTaskDetails.SubTaskGroupType;
import com.yugabyte.yw.commissioner.tasks.params.NodeTaskParams;
import com.yugabyte.yw.common.DnsManager;
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams;
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams.Cluster;
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams.UserIntent;
import com.yugabyte.yw.models.NodeInstance;
import com.yugabyte.yw.models.Universe;
import com.yugabyte.yw.models.helpers.NodeDetails;
import com.yugabyte.yw.models.helpers.NodeDetails.NodeState;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -73,11 +80,25 @@ public void run() {
createSetNodeStateTask(currentNode, NodeState.Adding)
.setSubTaskGroupType(SubTaskGroupType.StartingNode);

Cluster cluster = taskParams().getClusterByUuid(currentNode.placementUuid);
Collection<NodeDetails> node = new HashSet<NodeDetails>(Arrays.asList(currentNode));

// First spawn an instance for Decommissioned node.
boolean wasDecommissioned = currentNode.state == NodeState.Decommissioned;
if (wasDecommissioned) {
if (cluster.userIntent.providerType.equals(CloudType.onprem)) {
// For onprem universes, allocate an available node
// from the provider's node_instance table.
Map<UUID, List<String>> onpremAzToNodes = new HashMap<UUID, List<String>>();
List<String> nodeNameList = new ArrayList<>();
nodeNameList.add(currentNode.nodeName);
onpremAzToNodes.put(currentNode.azUuid, nodeNameList);
String instanceType = currentNode.cloudInfo.instance_type;

Map<String, NodeInstance> nodeMap = NodeInstance.pickNodes(onpremAzToNodes, instanceType);
currentNode.nodeUuid = nodeMap.get(currentNode.nodeName).nodeUuid;
}

createSetupServerTasks(node)
.setSubTaskGroupType(SubTaskGroupType.Provisioning);

Expand All @@ -93,6 +114,10 @@ public void run() {
// Bring up any masters, as needed.
boolean masterAdded = false;
if (areMastersUnderReplicated(currentNode, universe)) {
LOG.info(
"Bringing up master for under replicated universe {} ({})",
universe.universeUUID, universe.name
);
// Set gflags for master.
createGFlagsOverrideTasks(node, ServerType.MASTER);

Expand All @@ -115,8 +140,6 @@ public void run() {
masterAdded = true;
}

Cluster cluster = taskParams().getClusterByUuid(currentNode.placementUuid);

UniverseDefinitionTaskParams universeDetails = universe.getUniverseDetails();

// Explicitly set webserver ports for each dql
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,11 @@ public void run() {
}

// Create tasks to destroy the existing nodes.
createDestroyServerTasks(universe.getNodes(), params().isForceDelete, true)
.setSubTaskGroupType(SubTaskGroupType.RemovingUnusedServers);
createDestroyServerTasks(
universe.getNodes(),
params().isForceDelete,
true /* delete node */
).setSubTaskGroupType(SubTaskGroupType.RemovingUnusedServers);
}

// Create tasks to remove the universe entry from the Universe table.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
import com.yugabyte.yw.models.Universe;
import com.yugabyte.yw.models.helpers.NodeDetails;
import com.yugabyte.yw.models.helpers.NodeDetails.NodeState;
import com.yugabyte.yw.models.NodeInstance;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;

import org.slf4j.Logger;
Expand Down Expand Up @@ -85,30 +87,29 @@ public void run() {
createModifyBlackListTask(Arrays.asList(currentNode), false /* isAdd */)
.setSubTaskGroupType(SubTaskGroupType.ReleasingInstance);

if (instanceExists(taskParams())) {
if (userIntent.providerType.equals(CloudType.onprem)) {
boolean isOnprem = userIntent.providerType.equals(CloudType.onprem);
if (instanceExists(taskParams()) || isOnprem) {
Collection<NodeDetails> currentNodeDetails = new HashSet<>(Arrays.asList(currentNode));
if (isOnprem) {
// Stop master and tservers.
createStopServerTasks(universe.getNodes(), "master", false)
createStopServerTasks(currentNodeDetails, "master", true /* isForceDelete */)
.setSubTaskGroupType(SubTaskGroupType.StoppingNodeProcesses);
createStopServerTasks(universe.getNodes(), "tserver", false)
createStopServerTasks(currentNodeDetails, "tserver", true /* isForceDelete */)
.setSubTaskGroupType(SubTaskGroupType.StoppingNodeProcesses);
}

// Create tasks to terminate that instance. Force delete and ignore errors.
createDestroyServerTasks(new HashSet<NodeDetails>(Arrays.asList(currentNode)), true, false)
.setSubTaskGroupType(SubTaskGroupType.ReleasingInstance);
createDestroyServerTasks(
currentNodeDetails,
true /* isForceDelete */,
false /* deleteNode */
).setSubTaskGroupType(SubTaskGroupType.ReleasingInstance);
}

// Update Node State to Decommissioned.
createSetNodeStateTask(currentNode, NodeState.Decommissioned)
.setSubTaskGroupType(SubTaskGroupType.ReleasingInstance);

// Delete and reset node metadata for onprem universes.
if (userIntent.providerType.equals(CloudType.onprem)) {
deleteNodeFromUniverseTask(taskParams().nodeName)
.setSubTaskGroupType(SubTaskGroupType.ReleasingInstance);
}

// Update the DNS entry for this universe.
createDnsManipulationTask(DnsManager.DnsCommandType.Edit, false, userIntent.providerType,
userIntent.provider, userIntent.universeName)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,11 @@ public void setNodeNames(UniverseOpType opType, Universe universe) {
}

public void updateOnPremNodeUuids(Universe universe) {
LOG.debug("Update on prem nodes in universe {}.", taskParams().universeUUID);
LOG.info(
"Selecting prem nodes for universe {} ({}).",
universe.name,
taskParams().universeUUID
);

UniverseDefinitionTaskParams universeDetails = universe.getUniverseDetails();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ public void run() {
} catch (Exception e) {
if (!taskParams().isForceDelete) {
throw e;
} else {
LOG.debug("Ignoring error: {}", e.getMessage());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,17 @@ private void removeNodeFromUniverse(final String nodeName) {
LOG.error("No node in universe with name " + nodeName);
return;
}
UserIntent userIntent = u.getUniverseDetails()
.getClusterByUuid(u.getNode(taskParams().nodeName).placementUuid).userIntent;
// Persist the desired node information into the DB.
UniverseUpdater updater = new UniverseUpdater() {
@Override
public void run(Universe universe) {
UniverseDefinitionTaskParams universeDetails = universe.getUniverseDetails();
universeDetails.removeNode(nodeName);
LOG.debug("Removing node " + nodeName + " from universe " + taskParams().universeUUID);
LOG.info("Removed node " + nodeName + " from universe " + taskParams().universeUUID);
}
};

Universe.saveDetails(taskParams().universeUUID, updater);

if (userIntent.providerType.equals(Common.CloudType.onprem)) {
// Free up the node.
NodeInstance node = NodeInstance.getByName(nodeName);
node.inUse = false;
node.setNodeName("");
node.save();
}
}

@Override
Expand All @@ -84,6 +74,25 @@ public void run() {
}
}

Universe u = Universe.get(taskParams().universeUUID);
UserIntent userIntent = u.getUniverseDetails()
.getClusterByUuid(u.getNode(taskParams().nodeName).placementUuid).userIntent;
NodeDetails univNodeDetails = u.getNode(taskParams().nodeName);

if (userIntent.providerType.equals(Common.CloudType.onprem) &&
univNodeDetails.state != NodeDetails.NodeState.Decommissioned) {
// Free up the node.
try {
NodeInstance providerNode = NodeInstance.getByName(taskParams().nodeName);
providerNode.clearNodeDetails();
} catch (Exception e) {
if (!taskParams().isForceDelete) {
throw e;
}
}
LOG.info("Marked node instance {} as available", taskParams().nodeName);
}

if (taskParams().deleteNode) {
// Update the node state to removed. Even though we remove the node below, this will
// help tracking state for any nodes stuck in limbo.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public void run(Universe universe) {
}
Field field;
try {
LOG.info("Node {}: setting field {} to value {}.",
LOG.info("Node {}: setting univ node details field {} to value {}.",
taskParams.nodeName, entry.getKey(), entry.getValue());
// Error out if the host was not found.
if (entry.getKey().equals("host_found") && entry.getValue().asText().equals("false")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ public static int deleteByProvider(UUID providerUUID) {
return deleteStmt.execute();
}

/** Pick available nodes in zones specified by onpremAzToNodes with
* with the instance type specified
*/
public static synchronized Map<String, NodeInstance> pickNodes(
Map<UUID, List<String>> onpremAzToNodes, String instanceTypeCode) {
Map<String, NodeInstance> outputMap = new HashMap<String, NodeInstance>();
Expand All @@ -134,6 +137,7 @@ public static synchronized Map<String, NodeInstance> pickNodes(
node.setNodeName(nodeName);
outputMap.put(nodeName, node);
++index;
LOG.info("Marking node {} (ip {}) as in-use.", nodeName, node.getDetails().ip);
}
}
// All good, save to DB.
Expand Down

0 comments on commit fcb3607

Please sign in to comment.