Merge pull request hail-is#31 from Nealelab/liam-add-custom-python-packages

removed Anaconda from notebook init; added --pkgs option to cluster s…
Liam Abbott authored Oct 2, 2017
2 parents f9d776d + 6e4e06e commit 30d1f76
Showing 5 changed files with 175 additions and 166 deletions.
16 changes: 13 additions & 3 deletions README.md
@@ -91,14 +91,20 @@ from hail import *
hc = HailContext()
...
```
To read or write files stored in a Google bucket outside of Hail-specific commands, use Hail's `hadoop_read()` and `hadoop_write()` helper functions. For example, to read in a TSV file from Google storage to a nested Python list:
To read or write files stored in a Google bucket outside of Hail-specific commands, use Hail's `hadoop_read()` and `hadoop_write()` helper functions. For example, to read in a TSV file from Google storage to a pandas dataframe:
```
from hail import *
import pandas as pd
hc = HailContext()
with hadoop_read('gs://mybucket/mydata.tsv') as f:
    rows = [x.strip().split('\t') for x in f.readlines()]
    df = pd.read_table(f)
```

where pandas was included as a package to be installed in the cluster start command:
```
cluster start testcluster --pkgs pandas
```
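
Writing works the same way. For example, to write the pandas dataframe from the example above back to the bucket as a TSV (a minimal sketch; the output path is illustrative):
```
with hadoop_write('gs://mybucket/mydata_out.tsv') as f:
    f.write(df.to_csv(sep='\t', index=False))
```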

When you save your notebooks using either `File -> Save and Checkpoint` or `command + s`, they'll be saved automatically to the bucket you're working in.
@@ -158,7 +164,8 @@ usage: cluster start [-h] [--hash HASH] [--spark {2.0.2,2.1.0}]
                     [--worker-boot-disk-size WORKER_BOOT_DISK_SIZE]
                     [--worker-machine-type WORKER_MACHINE_TYPE] [--zone ZONE]
                     [--properties PROPERTIES] [--metadata METADATA]
                     [--jar JAR] [--zip ZIP] [--init INIT] [--vep]
                     [--packages PACKAGES] [--jar JAR] [--zip ZIP]
                     [--init INIT] [--vep]
                     name
Start a Dataproc cluster configured for Hail.
Expand Down Expand Up @@ -201,6 +208,9 @@ optional arguments:
                        Additional configuration properties for the cluster
  --metadata METADATA   Comma-separated list of metadata to add:
                        KEY1=VALUE1,KEY2=VALUE2...
  --packages PACKAGES, --pkgs PACKAGES
                        Comma-separated list of Python packages to be
                        installed on the master node.
  --jar JAR             Hail jar to use for Jupyter notebook.
  --zip ZIP             Hail zip to use for Jupyter notebook.
  --init INIT           Comma-separated list of init scripts to run.
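
The package list is handed to `pip` on the cluster's master node, so several packages can be requested at once, for example (package names here are illustrative):
```
cluster start testcluster --pkgs pandas,scipy
```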
290 changes: 134 additions & 156 deletions cloudtools/init_notebook.py
@@ -1,165 +1,143 @@
#!/usr/bin/env python

#!/usr/bin/python
import os
import json
import time
from subprocess import call, check_output
from subprocess import check_output, call

# get role of machine (master or worker)
role = check_output(['/usr/share/google/get_metadata_value', 'attributes/dataproc-role'])

# initialization actions to perform on master machine only
if role == 'Master':

    # download Anaconda Python 2.7 installation script
    call(['wget', '-P', '/home/anaconda2/', 'https://repo.continuum.io/archive/Anaconda2-4.3.1-Linux-x86_64.sh'])

    # install Anaconda in /home/anaconda2/
    call(['bash', '/home/anaconda2/Anaconda2-4.3.1-Linux-x86_64.sh', '-b', '-f', '-p', '/home/anaconda2/'])
    os.chmod('/home/anaconda2/', 0777)

    # additional packages to install
    pkgs = [
        'lxml',
        'jupyter-spark',
        'jgscm'
    ]

    # use pip to install packages
    for pkg in pkgs:
        call(['/home/anaconda2/bin/pip', 'install', pkg])

    # get Hail hash and Spark version to use for Jupyter notebook, if set through cluster startup metadata
    spark = check_output(['/usr/share/google/get_metadata_value', 'attributes/SPARK'])
    hail_version = check_output(['/usr/share/google/get_metadata_value', 'attributes/HAIL_VERSION'])
    hash_name = check_output(['/usr/share/google/get_metadata_value', 'attributes/HASH'])

    # default to Spark 2.0.2 if not otherwise specified through metadata
    if not spark:
        spark = '2.0.2'

    # default to version 0.1
    if not hail_version:
        hail_version = '0.1'

    # default to latest Hail build if none specified through metadata
    if not hash_name:
        hash_name = check_output(['gsutil', 'cat', 'gs://hail-common/builds/{0}/latest-hash-spark-{1}.txt'.format(hail_version, spark)]).strip()

    # Hail jar
    try:
        custom_jar = check_output(['/usr/share/google/get_metadata_value', 'attributes/JAR'])
    except:
        hail_jar = 'hail-{0}-{1}-Spark-{2}.jar'.format(hail_version, hash_name, spark)
        jar_path = 'gs://hail-common/builds/{0}/jars/{1}'.format(hail_version, hail_jar)
    else:
        hail_jar = custom_jar.rsplit('/')[-1]
        jar_path = custom_jar

    # Hail zip
    try:
        custom_zip = check_output(['/usr/share/google/get_metadata_value', 'attributes/ZIP'])
    except:
        hail_zip = 'hail-{0}-{1}.zip'.format(hail_version, hash_name)
        zip_path = 'gs://hail-common/builds/{0}/python/{1}'.format(hail_version, hail_zip)
    else:
        hail_zip = custom_zip.rsplit('/')[-1]
        zip_path = custom_zip

    # make directory for Hail and Jupyter notebook related files
    if not os.path.isdir('/home/hail/'):
        os.mkdir('/home/hail/')
        os.chmod('/home/hail/', 0777)

    # copy Hail jar and zip to local directory on master node
    call(['gsutil', 'cp', jar_path, '/home/hail/'])
    call(['gsutil', 'cp', zip_path, '/home/hail/'])

    # create Jupyter kernel spec file
    kernel = {
        'argv': [
            '/home/anaconda2/bin/python',
            '-m',
            'ipykernel',
            '-f',
            '{connection_file}'
        ],
        'display_name': 'Hail',
        'language': 'python',
        'env': {
            'PYTHONHASHSEED': '0',
            'SPARK_HOME': '/usr/lib/spark/',
            'SPARK_CONF_DIR': '/home/hail/conf/',
            'PYTHONPATH': '/usr/lib/spark/python/:/usr/lib/spark/python/lib/py4j-0.10.3-src.zip:/home/hail/{}'.format(hail_zip)
        }
    }

    # write kernel spec file to default Jupyter kernel directory
    os.makedirs('/home/anaconda2/share/jupyter/kernels/hail/')
    with open('/home/anaconda2/share/jupyter/kernels/hail/kernel.json', 'wb') as f:
        json.dump(kernel, f)

    # make directory for custom Spark conf
    os.mkdir('/home/hail/conf')

    # copy conf files to custom directory
    call(['cp', '/etc/spark/conf/spark-defaults.conf', '/home/hail/conf/spark-defaults.conf'])
    call(['cp', '/etc/spark/conf/spark-env.sh', '/home/hail/conf/spark-env.sh'])

    # modify custom Spark conf file to reference Hail jar and zip
    with open('/home/hail/conf/spark-defaults.conf', 'ab') as f:
        opts = [
            'spark.files=/home/hail/{}'.format(hail_jar),
            'spark.submit.pyFiles=/home/hail/{}'.format(hail_zip),
            'spark.driver.extraClassPath=./{}'.format(hail_jar),
            'spark.executor.extraClassPath=./{}'.format(hail_jar)
        ]
        f.write('\n'.join(opts))

    # add Spark variable designating Anaconda Python executable as the default on driver, in both custom and default conf files
    with open('/home/hail/conf/spark-env.sh', 'ab') as f_custom, open('/etc/spark/conf/spark-env.sh', 'ab') as f_default:
        f_custom.write('PYSPARK_DRIVER_PYTHON=/home/anaconda2/bin/python' + '\n')
        f_default.write('PYSPARK_DRIVER_PYTHON=/home/anaconda2/bin/python' + '\n')

    # create Jupyter configuration file
    call(['mkdir', '-p', '/home/anaconda2/etc/jupyter/'])
    with open('/home/anaconda2/etc/jupyter/jupyter_notebook_config.py', 'wb') as f:
        opts = [
            'c.Application.log_level = "DEBUG"',
            'c.NotebookApp.ip = "127.0.0.1"',
            'c.NotebookApp.open_browser = False',
            'c.NotebookApp.port = 8123',
            'c.NotebookApp.token = ""',
            'c.NotebookApp.contents_manager_class = "jgscm.GoogleStorageContentManager"'
        ]
        f.write('\n'.join(opts) + '\n')

    # setup jupyter-spark extension
    call(['/home/anaconda2/bin/jupyter', 'serverextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/home/anaconda2/bin/jupyter', 'nbextension', 'install', '--user', '--py', 'jupyter_spark'])
    call(['/home/anaconda2/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/home/anaconda2/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'widgetsnbextension'])

    # create systemd service file for Jupyter notebook server process
    with open('/lib/systemd/system/jupyter.service', 'wb') as f:
        opts = [
            '[Unit]',
            'Description=Jupyter Notebook',
            'After=hadoop-yarn-resourcemanager.service',
            '[Service]',
            'Type=simple',
            'User=root',
            'Group=root',
            'WorkingDirectory=/home/hail/',
            'ExecStart=/home/anaconda2/bin/python /home/anaconda2/bin/jupyter notebook',
            'Restart=always',
            'RestartSec=1',
            '[Install]',
            'WantedBy=multi-user.target'
        ]
        f.write('\n'.join(opts) + '\n')

    # add Jupyter service to autorun and start it
    call(['systemctl', 'daemon-reload'])
    call(['systemctl', 'enable', 'jupyter'])
    call(['service', 'jupyter', 'start'])
    # install pip
    call(['apt-get', 'update'])
    call(['apt-get', 'install', '-y', 'python-dev'])
    call(['apt-get', 'install', '-y', 'python-pip'])
    call(['pip', 'install', '--upgrade', 'pip'])

    # additional packages to install
    pkgs = [
        'decorator',
        'jupyter',
        'lxml',
        'jupyter-spark',
        'jgscm'
    ]

    # add user-requested packages
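    # (the metadata lookup fails with a non-zero exit status when no PKGS
    # attribute was set, so check_output raises and the default list above
    # is left unchanged)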
    try:
        user_pkgs = check_output(['/usr/share/google/get_metadata_value', 'attributes/PKGS'])
    except:
        pass
    else:
        pkgs.extend(user_pkgs.split(','))

    # use pip to install packages
    for pkg in pkgs:
        call(['pip', 'install', '--upgrade', pkg])

    # get Hail hash and Spark version to use for Jupyter notebook, if set through cluster startup metadata
    spark = check_output(['/usr/share/google/get_metadata_value', 'attributes/SPARK'])
    hail_version = check_output(['/usr/share/google/get_metadata_value', 'attributes/HAIL_VERSION'])
    hash_name = check_output(['/usr/share/google/get_metadata_value', 'attributes/HASH'])

    # Hail jar
    try:
        custom_jar = check_output(['/usr/share/google/get_metadata_value', 'attributes/JAR'])
    except:
        hail_jar = 'hail-{0}-{1}-Spark-{2}.jar'.format(hail_version, hash_name, spark)
        jar_path = 'gs://hail-common/builds/{0}/jars/{1}'.format(hail_version, hail_jar)
    else:
        hail_jar = custom_jar.rsplit('/')[-1]
        jar_path = custom_jar

    # Hail zip
    try:
        custom_zip = check_output(['/usr/share/google/get_metadata_value', 'attributes/ZIP'])
    except:
        hail_zip = 'hail-{0}-{1}.zip'.format(hail_version, hash_name)
        zip_path = 'gs://hail-common/builds/{0}/python/{1}'.format(hail_version, hail_zip)
    else:
        hail_zip = custom_zip.rsplit('/')[-1]
        zip_path = custom_zip

    # copy Hail jar and zip to local directory on master node
    call(['gsutil', 'cp', jar_path, '/usr/lib/spark/jars/'])
    call(['gsutil', 'cp', zip_path, '/usr/lib/spark/python/'])

    # modify custom Spark conf file to reference Hail jar and zip
    with open('/etc/spark/conf/spark-defaults.conf', 'a') as f:
        opts = [
            'spark.files=/usr/lib/spark/jars/{}'.format(hail_jar),
            'spark.submit.pyFiles=/usr/lib/spark/python/{}'.format(hail_zip),
            'spark.driver.extraClassPath=./{}'.format(hail_jar),
            'spark.executor.extraClassPath=./{}'.format(hail_jar)
        ]
        f.write('\n'.join(opts))
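    # (the relative "./" class path entries resolve on the executors because
    # spark.files ships the jar into each executor's working directory)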

    # create Jupyter kernel spec file
    kernel = {
        'argv': [
            '/usr/bin/python',
            '-m',
            'ipykernel',
            '-f',
            '{connection_file}'
        ],
        'display_name': 'Hail',
        'language': 'python',
        'env': {
            'PYTHONHASHSEED': '0',
            'SPARK_HOME': '/usr/lib/spark/',
            'SPARK_CONF_DIR': '/etc/spark/conf/',
            'PYTHONPATH': '/usr/lib/spark/python/:/usr/lib/spark/python/lib/py4j-0.10.3-src.zip:/usr/lib/spark/python/{}'.format(hail_zip)
        }
    }

    # write kernel spec file to default Jupyter kernel directory
    os.mkdir('/usr/local/share/jupyter/kernels/hail/')
    with open('/usr/local/share/jupyter/kernels/hail/kernel.json', 'w') as f:
        json.dump(kernel, f)

    # create Jupyter configuration file
    os.mkdir('/usr/local/etc/jupyter/')
    with open('/usr/local/etc/jupyter/jupyter_notebook_config.py', 'w') as f:
        opts = [
            'c.Application.log_level = "DEBUG"',
            'c.NotebookApp.ip = "127.0.0.1"',
            'c.NotebookApp.open_browser = False',
            'c.NotebookApp.port = 8123',
            'c.NotebookApp.token = ""',
            'c.NotebookApp.contents_manager_class = "jgscm.GoogleStorageContentManager"'
        ]
        f.write('\n'.join(opts) + '\n')

    # setup jupyter-spark extension
    call(['/usr/local/bin/jupyter', 'serverextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/usr/local/bin/jupyter', 'nbextension', 'install', '--user', '--py', 'jupyter_spark'])
    call(['/usr/local/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/usr/local/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'widgetsnbextension'])

    # create systemd service file for Jupyter notebook server process
    with open('/lib/systemd/system/jupyter.service', 'w') as f:
        opts = [
            '[Unit]',
            'Description=Jupyter Notebook',
            'After=hadoop-yarn-resourcemanager.service',
            '[Service]',
            'Type=simple',
            'User=root',
            'Group=root',
            'WorkingDirectory=/usr/local/',
            'ExecStart=/usr/bin/python /usr/local/bin/jupyter notebook --allow-root',
            'Restart=always',
            'RestartSec=1',
            '[Install]',
            'WantedBy=multi-user.target'
        ]
        f.write('\n'.join(opts) + '\n')

    # add Jupyter service to autorun and start it
    call(['systemctl', 'daemon-reload'])
    call(['systemctl', 'enable', 'jupyter'])
    call(['service', 'jupyter', 'start'])
16 changes: 12 additions & 4 deletions cloudtools/start.py
@@ -60,6 +60,8 @@ def init_parser(parser):
                        help='Additional configuration properties for the cluster')
    parser.add_argument('--metadata', default='',
                        help='Comma-separated list of metadata to add: KEY1=VALUE1,KEY2=VALUE2...')
    parser.add_argument('--packages', '--pkgs', default='',
                        help='Comma-separated list of Python packages to be installed on the master node.')

    # specify custom Hail jar and zip
    parser.add_argument('--jar', help='Hail jar to use for Jupyter notebook.')
@@ -102,13 +104,15 @@ def main(args):
    # default initialization script to start up cluster with
    init_actions = 'gs://hail-common/init_notebook-{}.py'.format(COMPATIBILITY_VERSION)

    if args.init:
        init_actions += ',' + args.init

    # add VEP action
    # add VEP init script
    if args.vep:
        init_actions += ',' + 'gs://hail-common/vep/vep/vep85-init.sh'

    # add custom init scripts
    if args.init:
        init_actions += ',' + args.init

    # get Hail build (default to latest)
    if args.hash == 'latest':
        hail_hash = check_output(['gsutil', 'cat', 'gs://hail-common/builds/{0}/latest-hash-spark-{1}.txt'.format(args.version, args.spark)]).strip()
    else:
@@ -125,6 +129,10 @@ def main(args):
    if args.zip:
        metadata += ',ZIP={}'.format(args.zip)

    # if Python packages requested, add metadata variable
    if args.packages:
        metadata = '^:^' + metadata.replace(',', ':') + ':PKGS={}'.format(args.packages)
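        # (a leading "^:^" tells gcloud to use ":" rather than "," as the
        # metadata delimiter, so the comma-separated package list is passed
        # through as a single PKGS value; see `gcloud topic escaping`)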

    # command to start cluster
    cmd = [
        'gcloud', 'dataproc', 'clusters', 'create',
