-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Automated GitHub Actions Test for gRPC Training (#148)
* added MPI Communication class * added send thread, merged 2 classes * improved comments * testing mpi, model weights not acquired * mpi works, occasional deadlock issue * merged send and listener threads * first draft of test * using python3.10 * made testing sys and algo configs * testing workflow * predict next move ish * moved quorum send * moved quorum send * using traditional fl algo * run test only during push to main * new dump_dir * remove send_status from proto * changed dump_dir * small changes
- Loading branch information
1 parent
5e960ab
commit cd614bb
Showing
13 changed files
with
417 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# CI workflow: installs the MPI/OpenGL system packages and the Python
# requirements, then runs the two training entry points from src/ against the
# test system config.
name: Test Training Code with gRPC

on:
  # Allows the workflow to be started manually.
  workflow_dispatch:
  push:
    branches:
      # - main
      # NOTE(review): in GitHub branch filters "*" does not match "/", so
      # branches such as "feature/x" would not trigger this -- confirm intent
      # ("**" matches every branch).
      - "*"
  pull_request:
    branches:
      - main

env:
  # NOTE(review): GitHub documents step debug logging as being enabled through
  # a repository secret/variable of this name -- confirm that setting it in the
  # workflow env has the intended effect.
  ACTIONS_STEP_DEBUG: true

jobs:
  train-check:
    runs-on: ubuntu-latest

    steps:
      # Step 1: Checkout the code
      - name: Checkout repository
        uses: actions/checkout@v3

      # Step 2: Set up Python
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10" # Specify the Python version you're using

      # Step 3: Install dependencies (OpenMPI, libGL/glib, then pip requirements)
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y libopenmpi-dev openmpi-bin
          sudo apt-get install -y libgl1 libglib2.0-0
          pip install -r requirements.txt
      # Step 4: Run gRPC server and client
      # The two python commands below run one after the other in the same shell.
      - name: Run test
        run: |
          cd src
          # chmod +x ./configs/algo_config_test.py
          echo "starting main grpc"
          python main_grpc.py -n 4 -host localhost
          echo "starting main"
          python main.py -super true -s "./configs/sys_config_test.py"
          echo "done"
          # further checks:
          # only 5 rounds
          # gRPC only? or also MPI?
          # num of samples
          # num users and nodes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
{ | ||
"python.analysis.typeCheckingMode": "strict" | ||
} | ||
"python.analysis.typeCheckingMode": "strict" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from utils.types import ConfigType | ||
|
||
# fedstatic: ConfigType = { | ||
# # Collaboration setup | ||
# "algo": "fedstatic", | ||
# "topology": {"name": "watts_strogatz", "k": 3, "p": 0.2}, # type: ignore | ||
# "rounds": 1, | ||
|
||
# # Model parameters | ||
# "model": "resnet10", | ||
# "model_lr": 3e-4, | ||
# "batch_size": 256, | ||
# } | ||
|
||
# Algorithm configuration for the test setup: the "fedavg" algorithm for a
# single round with a resnet10 model. It is imported by the test system
# config, which assigns it to every node.
traditional_fl: ConfigType = {
    # Collaboration setup
    "algo": "fedavg",
    "rounds": 1,

    # Model parameters
    "model": "resnet10",
    "model_lr": 3e-4,
    "batch_size": 256,
}
|
||
# default_config_list: List[ConfigType] = [fedstatic, fedstatic, fedstatic, fedstatic] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
from typing import Dict, List, Literal, Optional | ||
import random | ||
from utils.types import ConfigType | ||
|
||
from .algo_config_test import ( | ||
traditional_fl | ||
) | ||
|
||
def get_device_ids(num_users: int, gpus_available: List[int | Literal["cpu"]]) -> Dict[str, List[int | Literal["cpu"]]]: | ||
""" | ||
Get the GPU device IDs for the users. | ||
""" | ||
# TODO: Make it multi-host | ||
device_ids: Dict[str, List[int | Literal["cpu"]]] = {} | ||
for i in range(num_users + 1): # +1 for the super-node | ||
index = i % len(gpus_available) | ||
gpu_id = gpus_available[index] | ||
device_ids[f"node_{i}"] = [gpu_id] | ||
return device_ids | ||
|
||
|
||
def get_algo_configs(
    num_users: int,
    algo_configs: List[ConfigType],
    assignment_method: Literal[
        "sequential", "random", "mapping", "distribution"
    ] = "sequential",
    seed: Optional[int] = 1,
    mapping: Optional[List[int]] = None,
    distribution: Optional[Dict[int, int]] = None,
) -> Dict[str, ConfigType]:
    """
    Assign an algorithm configuration to each node, allowing for repetition.

    ``node_0`` (the super-node) always receives ``algo_configs[0]``; the user
    nodes ``node_1`` .. ``node_{num_users}`` are assigned according to
    ``assignment_method``:

    sequential: node ``i`` gets ``algo_configs[i % len(algo_configs)]``.
    random: each node gets ``random.choice(algo_configs)``; this uses the
        module-level RNG as-is and does not apply ``seed``.
    mapping: node ``i`` gets ``algo_configs[mapping[i - 1]]``.
    distribution: ``distribution`` maps an algo index to the number of nodes
        that should use it; nodes are picked in an order shuffled with
        ``seed``. Note that this reseeds the module-level ``random`` RNG.

    Args:
        num_users: Number of user nodes (excluding the super-node).
        algo_configs: Candidate configurations; must not be empty.
        assignment_method: One of the strategies described above.
        seed: Seed used for the shuffle in the ``distribution`` method.
        mapping: One algo index per user node (``mapping`` method only).
        distribution: Algo index -> node count; the counts must sum to
            ``num_users`` (``distribution`` method only).

    Returns:
        Mapping from node name to the configuration assigned to it.

    Raises:
        ValueError: If ``algo_configs`` is empty, the method is unknown, or
            the ``mapping``/``distribution`` argument is missing or does not
            match ``num_users``.
    """
    if not algo_configs:
        raise ValueError("algo_configs must contain at least one configuration")

    algo_config_map: Dict[str, ConfigType] = {}
    algo_config_map["node_0"] = algo_configs[0]  # Super-node
    if assignment_method == "sequential":
        for i in range(1, num_users + 1):
            algo_config_map[f"node_{i}"] = algo_configs[i % len(algo_configs)]
    elif assignment_method == "random":
        for i in range(1, num_users + 1):
            algo_config_map[f"node_{i}"] = random.choice(algo_configs)
    elif assignment_method == "mapping":
        if not mapping:
            raise ValueError("Mapping must be provided for assignment method 'mapping'")
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if len(mapping) != num_users:
            raise ValueError(
                f"Mapping has {len(mapping)} entries but num_users is {num_users}"
            )
        for i in range(1, num_users + 1):
            algo_config_map[f"node_{i}"] = algo_configs[mapping[i - 1]]
    elif assignment_method == "distribution":
        if not distribution:
            raise ValueError(
                "Distribution must be provided for assignment method 'distribution'"
            )
        total_users = sum(distribution.values())
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if total_users != num_users:
            raise ValueError(
                f"Distribution covers {total_users} nodes but num_users is {num_users}"
            )

        # List of node indices to assign
        node_indices = list(range(1, total_users + 1))
        # Seed for reproducibility (this reseeds the global RNG on purpose,
        # matching the existing behavior callers may rely on)
        random.seed(seed)
        # Shuffle the node indices based on the seed
        random.shuffle(node_indices)

        # Assign nodes based on the shuffled indices
        current_index = 0
        for algo_index, num_nodes in distribution.items():
            for _ in range(num_nodes):
                node_id = node_indices[current_index]
                algo_config_map[f"node_{node_id}"] = algo_configs[algo_index]
                current_index += 1
    else:
        raise ValueError(f"Invalid assignment method: {assignment_method}")
    # print("algo config mapping is: ", algo_config_map)
    return algo_config_map
|
||
# Dataset identifier and on-disk path placed into the system config below.
CIFAR10_DSET = "cifar10"
# NOTE(review): the name looks like a misspelling of CIFAR10_DPATH; it is
# referenced under this spelling in grpc_system_config below.
CIAR10_DPATH = "./datasets/imgs/cifar10/"

# DUMP_DIR = "../../../../../../../home/"
DUMP_DIR = "/tmp/"

NUM_COLLABORATORS = 1
# Number of user nodes; node_0 (the super-node) is counted separately.
num_users = 4

# Dropout settings template; dropout_rate 0.0 with the values below.
dropout_dict = {
    "distribution_dict": {  # leave dict empty to disable dropout
        "method": "uniform",  # "uniform", "normal"
        "parameters": {}  # "mean": 0.5, "std": 0.1 in case of normal distribution
    },
    "dropout_rate": 0.0,  # cutoff for dropout: [0,1]
    "dropout_correlation": 0.0,  # correlation between dropouts of successive rounds: [0,1]
}

# The super-node gets an empty dropout dict; every user node gets dropout_dict.
# NOTE(review): all user nodes reference the SAME dropout_dict object (no
# copy is made), so an in-place change through one entry is visible in all of
# them -- confirm nothing downstream mutates it.
dropout_dicts = {"node_0": {}}
for i in range(1, num_users + 1):
    dropout_dicts[f"node_{i}"] = dropout_dict

# Device ids handed out round-robin to node_0..node_{num_users} by
# get_device_ids. NOTE(review): presumably GPU indices of a specific machine --
# confirm they are valid wherever this config is run.
gpu_ids = [2, 3, 5, 6]

# System configuration using gRPC communication, with a single peer id
# (the super-node) at localhost:50048.
grpc_system_config: ConfigType = {
    "exp_id": "static",
    "num_users": num_users,
    "num_collaborators": NUM_COLLABORATORS,
    "comm": {"type": "GRPC", "synchronous": True, "peer_ids": ["localhost:50048"]},  # The super-node
    "dset": CIFAR10_DSET,
    "dump_dir": DUMP_DIR,
    "dpath": CIAR10_DPATH,
    "seed": 2,
    "device_ids": get_device_ids(num_users, gpu_ids),
    # "algos": get_algo_configs(num_users=num_users, algo_configs=default_config_list), # type: ignore
    # With a single entry in algo_configs, every node receives traditional_fl.
    "algos": get_algo_configs(num_users=num_users, algo_configs=[traditional_fl]),  # type: ignore
    # "samples_per_user": 50000 // num_users, # distributed equally
    "samples_per_user": 100,
    "train_label_distribution": "non_iid",
    "test_label_distribution": "iid",
    "alpha_data": 1.0,
    "exp_keys": [],
    "dropout_dicts": dropout_dicts,
    "test_samples_per_user": 200,
}

# Module-level name bound to the config defined above.
current_config = grpc_system_config
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -70,6 +70,5 @@ | |
# Start the scheduler | ||
scheduler.install_config() | ||
scheduler.initialize() | ||
|
||
# Run the job | ||
scheduler.run_job() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.