Merge pull request hail-is#31 from Nealelab/liam-add-custom-python-packages

removed Anaconda from notebook init; added --pkgs option to cluster s…
Liam Abbott authored Oct 2, 2017
2 parents f9d776d + 6e4e06e commit 30d1f76
Showing 5 changed files with 175 additions and 166 deletions.
16 changes: 13 additions & 3 deletions README.md
@@ -91,14 +91,20 @@ from hail import *
hc = HailContext()
...
```
To read or write files stored in a Google bucket outside of Hail-specific commands, use Hail's `hadoop_read()` and `hadoop_write()` helper functions. For example, to read in a TSV file from Google storage to a nested Python list:
To read or write files stored in a Google bucket outside of Hail-specific commands, use Hail's `hadoop_read()` and `hadoop_write()` helper functions. For example, to read in a TSV file from Google storage to a pandas dataframe:
```
from hail import *
import pandas as pd
hc = HailContext()
with hadoop_read('gs://mybucket/mydata.tsv') as f:
    rows = [x.strip().split('\t') for x in f.readlines()]
    df = pd.read_table(f)
```

where pandas was included as a package to be installed in the cluster start command:
```
cluster start testcluster --pkgs pandas
```
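
Writing works the same way. For example, to write the pandas dataframe from the example above back to the bucket as a TSV (a minimal sketch; the output path is illustrative):
```
with hadoop_write('gs://mybucket/mydata_out.tsv') as f:
    f.write(df.to_csv(sep='\t', index=False))
```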

When you save your notebooks using either `File -> Save and Checkpoint` or `command + s`, they'll be saved automatically to the bucket you're working in.
@@ -158,7 +164,8 @@ usage: cluster start [-h] [--hash HASH] [--spark {2.0.2,2.1.0}]
                     [--worker-boot-disk-size WORKER_BOOT_DISK_SIZE]
                     [--worker-machine-type WORKER_MACHINE_TYPE] [--zone ZONE]
                     [--properties PROPERTIES] [--metadata METADATA]
                     [--jar JAR] [--zip ZIP] [--init INIT] [--vep]
                     [--packages PACKAGES] [--jar JAR] [--zip ZIP]
                     [--init INIT] [--vep]
                     name
Start a Dataproc cluster configured for Hail.
Expand Down Expand Up @@ -201,6 +208,9 @@ optional arguments:
                        Additional configuration properties for the cluster
  --metadata METADATA   Comma-separated list of metadata to add:
                        KEY1=VALUE1,KEY2=VALUE2...
  --packages PACKAGES, --pkgs PACKAGES
                        Comma-separated list of Python packages to be
                        installed on the master node.
  --jar JAR             Hail jar to use for Jupyter notebook.
  --zip ZIP             Hail zip to use for Jupyter notebook.
  --init INIT           Comma-separated list of init scripts to run.
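
The package list is handed to `pip` on the cluster's master node, so several packages can be requested at once, for example (package names here are illustrative):
```
cluster start testcluster --pkgs pandas,scipy
```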
290 changes: 134 additions & 156 deletions cloudtools/init_notebook.py
@@ -1,165 +1,143 @@
#!/usr/bin/env python

#!/usr/bin/python
import os
import json
import time
from subprocess import call, check_output
from subprocess import check_output, call

# get role of machine (master or worker)
role = check_output(['/usr/share/google/get_metadata_value', 'attributes/dataproc-role'])

# initialization actions to perform on master machine only
if role == 'Master':

    # download Anaconda Python 2.7 installation script
    call(['wget', '-P', '/home/anaconda2/', 'https://repo.continuum.io/archive/Anaconda2-4.3.1-Linux-x86_64.sh'])

    # install Anaconda in /home/anaconda2/
    call(['bash', '/home/anaconda2/Anaconda2-4.3.1-Linux-x86_64.sh', '-b', '-f', '-p', '/home/anaconda2/'])
    os.chmod('/home/anaconda2/', 0777)

    # additional packages to install
    pkgs = [
        'lxml',
        'jupyter-spark',
        'jgscm'
    ]

    # use pip to install packages
    for pkg in pkgs:
        call(['/home/anaconda2/bin/pip', 'install', pkg])

    # get Hail hash and Spark version to use for Jupyter notebook, if set through cluster startup metadata
    spark = check_output(['/usr/share/google/get_metadata_value', 'attributes/SPARK'])
    hail_version = check_output(['/usr/share/google/get_metadata_value', 'attributes/HAIL_VERSION'])
    hash_name = check_output(['/usr/share/google/get_metadata_value', 'attributes/HASH'])

    # default to Spark 2.0.2 if not otherwise specified through metadata
    if not spark:
        spark = '2.0.2'

    # default to version 0.1
    if not hail_version:
        hail_version = '0.1'

    # default to latest Hail build if none specified through metadata
    if not hash_name:
        hash_name = check_output(['gsutil', 'cat', 'gs://hail-common/builds/{0}/latest-hash-spark-{1}.txt'.format(hail_version, spark)]).strip()

    # Hail jar
    try:
        custom_jar = check_output(['/usr/share/google/get_metadata_value', 'attributes/JAR'])
    except:
        hail_jar = 'hail-{0}-{1}-Spark-{2}.jar'.format(hail_version, hash_name, spark)
        jar_path = 'gs://hail-common/builds/{0}/jars/{1}'.format(hail_version, hail_jar)
    else:
        hail_jar = custom_jar.rsplit('/')[-1]
        jar_path = custom_jar

    # Hail zip
    try:
        custom_zip = check_output(['/usr/share/google/get_metadata_value', 'attributes/ZIP'])
    except:
        hail_zip = 'hail-{0}-{1}.zip'.format(hail_version, hash_name)
        zip_path = 'gs://hail-common/builds/{0}/python/{1}'.format(hail_version, hail_zip)
    else:
        hail_zip = custom_zip.rsplit('/')[-1]
        zip_path = custom_zip

    # make directory for Hail and Jupyter notebook related files
    if not os.path.isdir('/home/hail/'):
        os.mkdir('/home/hail/')
        os.chmod('/home/hail/', 0777)

    # copy Hail jar and zip to local directory on master node
    call(['gsutil', 'cp', jar_path, '/home/hail/'])
    call(['gsutil', 'cp', zip_path, '/home/hail/'])

    # create Jupyter kernel spec file
    kernel = {
        'argv': [
            '/home/anaconda2/bin/python',
            '-m',
            'ipykernel',
            '-f',
            '{connection_file}'
        ],
        'display_name': 'Hail',
        'language': 'python',
        'env': {
            'PYTHONHASHSEED': '0',
            'SPARK_HOME': '/usr/lib/spark/',
            'SPARK_CONF_DIR': '/home/hail/conf/',
            'PYTHONPATH': '/usr/lib/spark/python/:/usr/lib/spark/python/lib/py4j-0.10.3-src.zip:/home/hail/{}'.format(hail_zip)
        }
    }

    # write kernel spec file to default Jupyter kernel directory
    os.makedirs('/home/anaconda2/share/jupyter/kernels/hail/')
    with open('/home/anaconda2/share/jupyter/kernels/hail/kernel.json', 'wb') as f:
        json.dump(kernel, f)

    # make directory for custom Spark conf
    os.mkdir('/home/hail/conf')

    # copy conf files to custom directory
    call(['cp', '/etc/spark/conf/spark-defaults.conf', '/home/hail/conf/spark-defaults.conf'])
    call(['cp', '/etc/spark/conf/spark-env.sh', '/home/hail/conf/spark-env.sh'])

    # modify custom Spark conf file to reference Hail jar and zip
    with open('/home/hail/conf/spark-defaults.conf', 'ab') as f:
        opts = [
            'spark.files=/home/hail/{}'.format(hail_jar),
            'spark.submit.pyFiles=/home/hail/{}'.format(hail_zip),
            'spark.driver.extraClassPath=./{}'.format(hail_jar),
            'spark.executor.extraClassPath=./{}'.format(hail_jar)
        ]
        f.write('\n'.join(opts))

    # add Spark variable designating Anaconda Python executable as the default on driver, in both custom and default conf files
    with open('/home/hail/conf/spark-env.sh', 'ab') as f_custom, open('/etc/spark/conf/spark-env.sh', 'ab') as f_default:
        f_custom.write('PYSPARK_DRIVER_PYTHON=/home/anaconda2/bin/python' + '\n')
        f_default.write('PYSPARK_DRIVER_PYTHON=/home/anaconda2/bin/python' + '\n')

    # create Jupyter configuration file
    call(['mkdir', '-p', '/home/anaconda2/etc/jupyter/'])
    with open('/home/anaconda2/etc/jupyter/jupyter_notebook_config.py', 'wb') as f:
        opts = [
            'c.Application.log_level = "DEBUG"',
            'c.NotebookApp.ip = "127.0.0.1"',
            'c.NotebookApp.open_browser = False',
            'c.NotebookApp.port = 8123',
            'c.NotebookApp.token = ""',
            'c.NotebookApp.contents_manager_class = "jgscm.GoogleStorageContentManager"'
        ]
        f.write('\n'.join(opts) + '\n')

    # setup jupyter-spark extension
    call(['/home/anaconda2/bin/jupyter', 'serverextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/home/anaconda2/bin/jupyter', 'nbextension', 'install', '--user', '--py', 'jupyter_spark'])
    call(['/home/anaconda2/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/home/anaconda2/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'widgetsnbextension'])

    # create systemd service file for Jupyter notebook server process
    with open('/lib/systemd/system/jupyter.service', 'wb') as f:
        opts = [
            '[Unit]',
            'Description=Jupyter Notebook',
            'After=hadoop-yarn-resourcemanager.service',
            '[Service]',
            'Type=simple',
            'User=root',
            'Group=root',
            'WorkingDirectory=/home/hail/',
            'ExecStart=/home/anaconda2/bin/python /home/anaconda2/bin/jupyter notebook',
            'Restart=always',
            'RestartSec=1',
            '[Install]',
            'WantedBy=multi-user.target'
        ]
        f.write('\n'.join(opts) + '\n')

    # add Jupyter service to autorun and start it
    call(['systemctl', 'daemon-reload'])
    call(['systemctl', 'enable', 'jupyter'])
    call(['service', 'jupyter', 'start'])
    # install pip
    call(['apt-get', 'update'])
    call(['apt-get', 'install', '-y', 'python-dev'])
    call(['apt-get', 'install', '-y', 'python-pip'])
    call(['pip', 'install', '--upgrade', 'pip'])

    # additional packages to install
    pkgs = [
        'decorator',
        'jupyter',
        'lxml',
        'jupyter-spark',
        'jgscm'
    ]

    # add user-requested packages
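    # (the metadata lookup fails with a non-zero exit status when no PKGS
    # attribute was set, so check_output raises and the default list above
    # is left unchanged)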
    try:
        user_pkgs = check_output(['/usr/share/google/get_metadata_value', 'attributes/PKGS'])
    except:
        pass
    else:
        pkgs.extend(user_pkgs.split(','))

    # use pip to install packages
    for pkg in pkgs:
        call(['pip', 'install', '--upgrade', pkg])

    # get Hail hash and Spark version to use for Jupyter notebook, if set through cluster startup metadata
    spark = check_output(['/usr/share/google/get_metadata_value', 'attributes/SPARK'])
    hail_version = check_output(['/usr/share/google/get_metadata_value', 'attributes/HAIL_VERSION'])
    hash_name = check_output(['/usr/share/google/get_metadata_value', 'attributes/HASH'])

    # Hail jar
    try:
        custom_jar = check_output(['/usr/share/google/get_metadata_value', 'attributes/JAR'])
    except:
        hail_jar = 'hail-{0}-{1}-Spark-{2}.jar'.format(hail_version, hash_name, spark)
        jar_path = 'gs://hail-common/builds/{0}/jars/{1}'.format(hail_version, hail_jar)
    else:
        hail_jar = custom_jar.rsplit('/')[-1]
        jar_path = custom_jar

    # Hail zip
    try:
        custom_zip = check_output(['/usr/share/google/get_metadata_value', 'attributes/ZIP'])
    except:
        hail_zip = 'hail-{0}-{1}.zip'.format(hail_version, hash_name)
        zip_path = 'gs://hail-common/builds/{0}/python/{1}'.format(hail_version, hail_zip)
    else:
        hail_zip = custom_zip.rsplit('/')[-1]
        zip_path = custom_zip

    # copy Hail jar and zip to local directory on master node
    call(['gsutil', 'cp', jar_path, '/usr/lib/spark/jars/'])
    call(['gsutil', 'cp', zip_path, '/usr/lib/spark/python/'])

    # modify custom Spark conf file to reference Hail jar and zip
    with open('/etc/spark/conf/spark-defaults.conf', 'a') as f:
        opts = [
            'spark.files=/usr/lib/spark/jars/{}'.format(hail_jar),
            'spark.submit.pyFiles=/usr/lib/spark/python/{}'.format(hail_zip),
            'spark.driver.extraClassPath=./{}'.format(hail_jar),
            'spark.executor.extraClassPath=./{}'.format(hail_jar)
        ]
        f.write('\n'.join(opts))
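    # (the relative "./" class path entries resolve on the executors because
    # spark.files ships the jar into each executor's working directory)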

    # create Jupyter kernel spec file
    kernel = {
        'argv': [
            '/usr/bin/python',
            '-m',
            'ipykernel',
            '-f',
            '{connection_file}'
        ],
        'display_name': 'Hail',
        'language': 'python',
        'env': {
            'PYTHONHASHSEED': '0',
            'SPARK_HOME': '/usr/lib/spark/',
            'SPARK_CONF_DIR': '/etc/spark/conf/',
            'PYTHONPATH': '/usr/lib/spark/python/:/usr/lib/spark/python/lib/py4j-0.10.3-src.zip:/usr/lib/spark/python/{}'.format(hail_zip)
        }
    }

    # write kernel spec file to default Jupyter kernel directory
    os.mkdir('/usr/local/share/jupyter/kernels/hail/')
    with open('/usr/local/share/jupyter/kernels/hail/kernel.json', 'w') as f:
        json.dump(kernel, f)

    # create Jupyter configuration file
    os.mkdir('/usr/local/etc/jupyter/')
    with open('/usr/local/etc/jupyter/jupyter_notebook_config.py', 'w') as f:
        opts = [
            'c.Application.log_level = "DEBUG"',
            'c.NotebookApp.ip = "127.0.0.1"',
            'c.NotebookApp.open_browser = False',
            'c.NotebookApp.port = 8123',
            'c.NotebookApp.token = ""',
            'c.NotebookApp.contents_manager_class = "jgscm.GoogleStorageContentManager"'
        ]
        f.write('\n'.join(opts) + '\n')

    # setup jupyter-spark extension
    call(['/usr/local/bin/jupyter', 'serverextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/usr/local/bin/jupyter', 'nbextension', 'install', '--user', '--py', 'jupyter_spark'])
    call(['/usr/local/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/usr/local/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'widgetsnbextension'])

    # create systemd service file for Jupyter notebook server process
    with open('/lib/systemd/system/jupyter.service', 'w') as f:
        opts = [
            '[Unit]',
            'Description=Jupyter Notebook',
            'After=hadoop-yarn-resourcemanager.service',
            '[Service]',
            'Type=simple',
            'User=root',
            'Group=root',
            'WorkingDirectory=/usr/local/',
            'ExecStart=/usr/bin/python /usr/local/bin/jupyter notebook --allow-root',
            'Restart=always',
            'RestartSec=1',
            '[Install]',
            'WantedBy=multi-user.target'
        ]
        f.write('\n'.join(opts) + '\n')

    # add Jupyter service to autorun and start it
    call(['systemctl', 'daemon-reload'])
    call(['systemctl', 'enable', 'jupyter'])
    call(['service', 'jupyter', 'start'])
16 changes: 12 additions & 4 deletions cloudtools/start.py
@@ -60,6 +60,8 @@ def init_parser(parser):
                        help='Additional configuration properties for the cluster')
    parser.add_argument('--metadata', default='',
                        help='Comma-separated list of metadata to add: KEY1=VALUE1,KEY2=VALUE2...')
    parser.add_argument('--packages', '--pkgs', default='',
                        help='Comma-separated list of Python packages to be installed on the master node.')

    # specify custom Hail jar and zip
    parser.add_argument('--jar', help='Hail jar to use for Jupyter notebook.')
@@ -102,13 +104,15 @@ def main(args):
    # default initialization script to start up cluster with
    init_actions = 'gs://hail-common/init_notebook-{}.py'.format(COMPATIBILITY_VERSION)

    if args.init:
        init_actions += ',' + args.init

    # add VEP action
    # add VEP init script
    if args.vep:
        init_actions += ',' + 'gs://hail-common/vep/vep/vep85-init.sh'

    # add custom init scripts
    if args.init:
        init_actions += ',' + args.init

    # get Hail build (default to latest)
    if args.hash == 'latest':
        hail_hash = check_output(['gsutil', 'cat', 'gs://hail-common/builds/{0}/latest-hash-spark-{1}.txt'.format(args.version, args.spark)]).strip()
    else:
@@ -125,6 +129,10 @@ def main(args):
    if args.zip:
        metadata += ',ZIP={}'.format(args.zip)

    # if Python packages requested, add metadata variable
    if args.packages:
        metadata = '^:^' + metadata.replace(',', ':') + ':PKGS={}'.format(args.packages)
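        # (a leading "^:^" tells gcloud to use ":" rather than "," as the
        # metadata delimiter, so the comma-separated package list is passed
        # through as a single PKGS value; see `gcloud topic escaping`)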

    # command to start cluster
    cmd = [
        'gcloud', 'dataproc', 'clusters', 'create',
