#!/bin/bash
set -x -e #https://explainshell.com/explain?cmd=set+-x+-e
# Update the Linux distribution packages
sudo yum -y update
# Jupyter password defaults to "password123" unless the first command-line argument overrides it.
JUPYTER_PASSWORD=${1:-"password123"}
# Jupyter notebook port defaults to 8194 unless the second command-line argument overrides it.
PORT=${2:-8194}
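# For illustration only (the values below are hypothetical, not part of the original setup):
# the two positional arguments above can be supplied either when running the script by hand or
# through the Args list of the --bootstrap-action flag shown at the bottom of this file, e.g.
#   bash emr_bootstrap.sh "myStrongPassword" 8890
#   --bootstrap-action Path=s3://my-bucket/emr_bootstrap.sh,Args=["myStrongPassword","8890"]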
# Install Miniconda - preferred over Anaconda because it is much smaller, which avoids memory/disk pressure on the instances and needless shifting of directories.
sudo wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh #https://explainshell.com/explain?cmd=sudo+wget+https%3A%2F%2Frepo.continuum.io%2Fminiconda%2FMiniconda3-latest-Linux-x86_64.sh+-O+~%2Fminiconda.sh
# Run the Miniconda installer in batch mode (-b), installing into $HOME/miniconda (-p)
bash ~/miniconda.sh -b -p $HOME/miniconda
echo -e '\nexport PATH=$HOME/miniconda/bin:$PATH' >> $HOME/.bashrc #need to update path so we can use conda in the command line.
source $HOME/.bashrc # Don't forget to source here - otherwise the updated PATH is not picked up and everything below will break!
echo -e '\nexport SPARK_HOME=/usr/lib/spark' >> $HOME/.bashrc # spark lives here
echo -e "\nexport PATH=$PATH:$SPARK_HOME/bin" >> $HOME/.bashrc # need to append to path so that Jupyter and our command line know where to find and execute spark
echo -e '\nexport PYSPARK_PYTHON="/home/hadoop/miniconda/bin/python3.6"' >> $HOME/.bashrc # tells pyspark to run on python 3.6
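# Note: the interpreter path above assumes the Miniconda build ships Python 3.6 at that exact
# location; if a different Python version is bundled, adjust the path accordingly.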
source $HOME/.bashrc
# My preferred aliases
echo -e '\nalias c="clear"' >> $HOME/.bashrc
echo -e "\nalias py='ipython3'" >> $HOME/.bashrc
echo -e "\nalias ls='ls -hlpGF'" >> $HOME/.bashrc
source $HOME/.bashrc
conda update -y conda # update miniconda
rm ~/miniconda.sh # This removes the script we used to install miniconda.
conda install -y boto3 numpy pandas scipy scikit-learn statsmodels ipython # packages installed on every EC2 instance, master and slave nodes alike
# Recall that this bootstrap script runs on BOTH the master node and the slave nodes at the same time.
## The code below will ONLY be run on the master node.
IS_MASTER=false
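# For reference, /mnt/var/lib/info/instance.json on an EMR node contains a JSON document with a
# boolean "isMaster" field; a minimal sketch of the relevant fragment (other keys omitted) is:
#   { "isMaster": true, ... }
# so the grep pipeline below exits successfully only on the master node.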
if grep isMaster /mnt/var/lib/info/instance.json | grep true; #This segment of code identifies if an EC2 instance is labeled as the Master.
then
IS_MASTER=true
sudo yum install -y git #install git on master node
echo "Installing Python libs for Master" # this echo is useful for investigating logs if bootstrap fails.
conda install -y jupyter matplotlib seaborn #install jupyter and viz packages on master. No need to be done on slaves.
jupyter notebook --generate-config # create a default Jupyter config file; the settings below are then patched into it
HASHED_PASSWORD=$(python -c "from notebook.auth import passwd; print(passwd('$JUPYTER_PASSWORD'))")
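# The value produced above is a salted password hash (for the classic notebook, typically of the
# form 'sha1:<salt>:<digest>'), which is the format jupyter_notebook_config.py expects for c.NotebookApp.password.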
sed -i '/c.NotebookApp.port/d' ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.port = $PORT" >> ~/.jupyter/jupyter_notebook_config.py
sed -i '/c.NotebookApp.ip/d' ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.ip = '*'" >> ~/.jupyter/jupyter_notebook_config.py
sed -i '/c.NotebookApp.open_browser/d' ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.open_browser = False" >> ~/.jupyter/jupyter_notebook_config.py
sed -i '/c.MultiKernelManager.default_kernel_name/d' ~/.jupyter/jupyter_notebook_config.py
echo "c.MultiKernelManager.default_kernel_name = 'pyspark'" >> ~/.jupyter/jupyter_notebook_config.py #default kernel set to PySpark
sed -i '/c.NotebookApp.password/d' ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.password = u'$HASHED_PASSWORD'" >> ~/.jupyter/jupyter_notebook_config.py
echo -e '\nexport PYSPARK_DRIVER_PYTHON=jupyter' >> $HOME/.bashrc
echo -e '\nexport PYSPARK_DRIVER_PYTHON_OPTS="notebook"' >> $HOME/.bashrc
echo -e '\nalias jn="jupyter notebook"' >> $HOME/.bashrc
echo -e '\nalias ps="pyspark"' >> $HOME/.bashrc
source $HOME/.bashrc
fi
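############## CONNECTING TO THE NOTEBOOK - a minimal sketch, assuming the defaults above and a
# hypothetical key file; the master public DNS name is a placeholder. If the master's security
# group does not open the Jupyter port, an SSH tunnel from your local machine is one way to reach it:
#   ssh -i ~/my-key.pem -N -L 8194:localhost:8194 hadoop@<insert master public DNS here>
# then browse to http://localhost:8194 and log in with the password passed to this script.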
############## AWS CLI EXECUTION - below is how you would launch a cluster from the AWS CLI, using this script as the bootstrap action
# aws emr create-cluster --name "Spark Cluster" \
# --release-label emr-5.11.1 \
# --use-default-roles \
# --log-uri <insert s3 path here for your EMR logs to be stored> \
# --applications Name=Spark Name=Ganglia \
# --ec2-attributes \
# KeyName=<insert name of .pem key here>,SubnetId=<insert subnet ID here>,EmrManagedMasterSecurityGroup=<insert master node SG here>,EmrManagedSlaveSecurityGroup=<insert Slave Nodes SG here> \
# --instance-type=m4.large \
# --instance-count 3 \
# --region us-east-1 \
# --bootstrap-action Path=<insert s3 path to bootstrap here>,Args=[<insert bootstrap script cmd line args here>]
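############## CHECKING THE CLUSTER - an optional sketch using standard AWS CLI commands; the
# cluster id is a placeholder you would substitute from the create-cluster output.
# aws emr list-clusters --active --region us-east-1
# aws emr describe-cluster --cluster-id <insert cluster id here> \
#   --query 'Cluster.Status.State' --region us-east-1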