python.md

conda update package

conda install -c bioconda scanpy=1.5.0
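Pinning a version with conda install (as above) also up- or downgrades an existing install; to simply move to the newest version available in the channel:

conda update -c bioconda scanpy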

get version

module.__version__               # module version
import sys; print(sys.version)   # Python version within a script (sys.version is a string, not a callable)

reload module

import importlib
importlib.reload(module)

module search path

import sys
sys.path

sys.path.append('additional-module-path') # add module path

print docstrings and annotation

print(func.__doc__)
print(func.__annotations__)
Namespaces and scope: Python follows the LEGB rule when resolving a variable name:
Local -> Enclosed -> Global -> Built-in
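A minimal sketch of the lookup order (the names here are illustrative):

x = 'global'              # Global scope

def outer():
    x = 'enclosed'        # Enclosing scope
    def inner():
        x = 'local'       # Local scope
        print(x)          # prints 'local': L is searched before E, G, B
    inner()

outer()
print(len('abc'))         # len is found in the Built-in scope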
modules vs classes

TLDR

  • Use classes as blueprints for objects that model your problem domain, to create exceptions, and for OOP design patterns
  • Use modules to collect functionality into logical units
Python caching
# Fibonacci with memoization (the plain recursive version is slow because intermediate results are recomputed; from Python for Finance)

from functools import lru_cache as cache

@cache(maxsize=None)
def fib_rec_py2(n):
    if n < 2:
        return n
    else:
        return fib_rec_py2(n - 1) + fib_rec_py2(n - 2)
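On Python 3.9+, functools.cache is shorthand for lru_cache(maxsize=None). A quick check that the cache is doing the work (the call below is illustrative):

fib_rec_py2(35)            # returns 9227465 almost instantly thanks to memoization
fib_rec_py2.cache_info()   # hit/miss statistics exposed by lru_cache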
Date Time

import datetime
x = datetime.datetime.now()
print(x.strftime("%Y%m%d"))  # 20200723
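Going the other way, strptime parses a string back into a datetime (the format string must match the input):

y = datetime.datetime.strptime("20200723", "%Y%m%d")
print(y.date())  # 2020-07-23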

executing modules as scripts

def main():
    print('hello')

if __name__ == '__main__':
    main()

# on cmd line
python script.py

catch all errors

import sys

try:
    f = open('myfile.txt')
    s = f.readline()
    i = int(s.strip())
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise
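On Python 3, the same catch-all information is available without sys.exc_info() by catching Exception explicitly; a minimal sketch (risky() is a hypothetical function):

try:
    risky()  # hypothetical function that may raise anything
except Exception as err:
    print(f"Unexpected {type(err).__name__}: {err}")
    raise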

get class and module attributes

dir(myclass)

Hiding credentials, config from users
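One common approach is to keep secrets out of the source entirely and read them from environment variables; a minimal sketch (the variable name is illustrative):

import os

# export DB_PASSWORD=... in the shell, or load it from a .env file kept out of version control
db_password = os.environ.get("DB_PASSWORD")
if db_password is None:
    raise RuntimeError("DB_PASSWORD is not set")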

Empty __init__.py in folder

It just tells Python that the folder is a package.
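For example, with this layout (the package and module names are illustrative):

mypackage/
    __init__.py   # empty; marks mypackage as importable
    utils.py

# then, anywhere mypackage is on sys.path:
from mypackage import utils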

DRY principle (Don't repeat yourself):

  1. Write functions for a task that is performed over and over.
  2. Create loops that iterate over repetitive tasks.
  3. Use conditional statements to control if and when code is executed.
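A minimal sketch combining all three points (pandas and the file names are illustrative):

import pandas as pd

def clean(path):
    """Read a CSV and drop empty rows -- one task, written once."""
    return pd.read_csv(path).dropna()

# a loop replaces copy-pasted per-file code
for path in ["a.csv", "b.csv", "c.csv"]:
    df = clean(path)
    if not df.empty:  # a conditional controls when the write happens
        df.to_csv(path.replace(".csv", "_clean.csv"), index=False)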

idiomatic Python, awk

Readable code, style guide (PEP 8), my code isn't working

Python skeleton/template for apps

setting up jupyter notebook on a compute node (https://oncomputingwell.princeton.edu/2018/05/jupyter-on-the-cluster/)

## on the compute node
module load anaconda3

# for Jupyter Lab:
jupyter-lab --no-browser --port=8889 --ip=0.0.0.0

# for Jupyter Notebook:
jupyter-notebook --no-browser --port=8889 --ip=0.0.0.0

# get the hostname (needed for the tunnel below)
hostname

## on the local machine
ssh -N -f -L 8889:<hostname>:8889 <yourusername>@<hpc remote server>

# to kill the background ssh tunnel
lsof -i tcp:8889   # get the PID
kill -9 <PID>

Running bash commands

import subprocess
import shlex

cmd = 'aws s3 cp --recursive {} {}'.format(s3_path, directory_to_download)
subprocess.check_call(shlex.split(cmd))
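On Python 3.5+, subprocess.run is the recommended interface; the equivalent call, which also raises on a non-zero exit status:

subprocess.run(shlex.split(cmd), check=True)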

pandas groupby value_counts

df = pd.DataFrame([
    (1, 1, 'term1'),
    (1, 2, 'term2'),
    (1, 1, 'term1'),
    (1, 1, 'term2'),
    (2, 2, 'term3'),
    (2, 3, 'term1'),
    (2, 2, 'term1')
], columns=['id', 'group', 'term'])

df.groupby(['id', 'group', 'term']).size().unstack(fill_value=0)

# alternative
df.pivot_table(index=['id','group'], columns='term', aggfunc='size', fill_value=0)

strategy for reading large csv pandas

  • Chunking your data
import pandas as pd

# read the file in chunks
data_iterator = pd.read_csv("large_data.csv", chunksize=100000)

chunk_list = []  

# Each chunk is in dataframe format
for data_chunk in data_iterator:
    # process the chunk and keep the result (chunk_filtering is a user-defined function)
    filtered_chunk = chunk_filtering(data_chunk)
    chunk_list.append(filtered_chunk)

# combine chunks
filtered_data = pd.concat(chunk_list)
  • Dropping data
use_cols = ["stock_price", "stock_volume", "stock_symbol", "dividend", "eps"]
ignore_cols = ["stock_name", "date_of_ipo"]  # listed only for clarity; anything not in usecols is skipped

df = pd.read_csv("large_data.csv", usecols=use_cols)
  • Set specific data types for each column
import numpy as np

df = pd.read_csv("large_data.csv", dtype={'column_A': np.int32, 'column_B': np.float16})

Sort every column in pandas df

df.transform(np.sort)  # sorts each column independently, so rows no longer align

pygsheet

Detect duplicates numpy

import numpy as np
from collections import Counter
a = np.array([1, 2, 1, 3, 3, 3, 0])
[item for item, count in Counter(a).items() if count > 1]
> [1, 3]

# using numpy
u, c = np.unique(a, return_counts=True)
dup = u[c > 1]  # array([1, 3])

Recursion

# Looking for repeated keys in a nested dictionary

def findKey(obj, key):
    for k, v in obj.items():
        if isinstance(v, dict):
            # a key whose value is itself a dict is descended into, not printed
            findKey(v, key)
        elif k == key:
            print(v)

myDic = {"A": {"A": 1, "B": {"B": 2, "C": {"C": 3}}}, "D": 4}
findKey(myDic, "A")
> 1
findKey(myDic, "B")
> 2

# on lists

findNext = lambda x, obj: -1 if len(obj) == 0 \
    or len(obj) == 1 \
    else obj[1] if x == obj[0] \
    else findNext(x, obj[1:])
findPrev = lambda x, obj: -1 if len(obj) == 0 \
    or len(obj) == 1 \
    else obj[0] if x == obj[1] \
    else findPrev(x, obj[1:])

myList = [1,2,3,4,5,6]
findNext(4, myList)
> 5
findPrev(4, myList)
> 3
findNext(4, [])
> -1
findNext(4, [3,4])
> -1
  • Functional Programming For Dummies

Passing functions in Python

def doAdd(x, y):
    return x + y
def doSub(x, y):
    return x - y
def compareWithHundred(function, x, y):
    z = function(x, y)
    # labels whether 100 is greater than, equal to, or less than the result z
    label = lambda n: "GT" if 100 > n \
        else "EQ" if 100 == n else "LT"
    return label(z)

doAdd(99,2)
> 101
doSub(99,2)
> 97
compareWithHundred(doAdd, 99, 2)
> 'LT'
compareWithHundred(doSub, 99, 2)
> 'GT'
compareWithHundred(doAdd, 99, 1)
> 'EQ'
  • Functional Programming For Dummies

Python design patterns
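As an illustrative example (not from the original notes), the Borg/shared-state idiom is one classic Python pattern:

class Borg:
    """Shared-state pattern: every instance shares one attribute dict."""
    _shared_state = {}

    def __init__(self):
        self.__dict__ = self._shared_state

a = Borg()
b = Borg()
a.x = 1
print(b.x)  # 1 -- state set on one instance is visible on all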

Seaborn

import seaborn as sns
sns.relplot(x="gex-umi-sum-ash",
            y="hto-umi-sum-ash",
            hue="filter-2",
            data=df['meta_cell'].loc[keep_barcodes].sample(frac=0.5),
            s=6,
            height=8,
            aspect=1.3)

# set figsize
import matplotlib.pyplot as plt
f, ax = plt.subplots(figsize=(7, 3))
sns.countplot(y="deck", data=titanic, color="c")

# rotate labels
plt.figure(figsize=(10,5))
chart = sns.countplot(
    data=data[data['Year'] == 1980],
    x='Sport',
    palette='Set1'
)
chart.set_xticklabels(chart.get_xticklabels(),
                      rotation=45,
                      horizontalalignment='right')

Matplotlib

# set tick color
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(5,4))
ax.plot([1,2,3])

ax.get_xticklabels()[3].set_color("red")

plt.show()

# figsize and subplots; eg. plots 3 fig on same row
import matplotlib.pyplot as plt
for col in df['hto-clr'].columns.tolist():
    plt.figure(figsize=(10,3))
    
    plt.subplot(1, 3, 1)
    df['hto-clr'][col].plot.hist(bins=100,title=col + ' CLR')

    plt.subplot(1, 3, 2)
    df['hto-ash'][col].plot.hist(bins=100,title=col + ' ASINH')

    plt.subplot(1, 3, 3)
    df['hto-gt'][col].plot.hist(bins=100,title=col + ' GEO MEAN')

    plt.tight_layout() # so they don't overlap
    plt.show()
    
# remove legend
ax.get_legend().remove()

# hue and hue order
palette ={"unassigned": "C0", "doublet": "C1", "singlet": "C2"}
hue_order = ["unassigned","doublet","singlet"]
sns.relplot(x="gex-umi-sum-ash", 
            y="hto-umi-sum-ash", 
            hue="souporcell_status",
            hue_order=hue_order,
            data=df['meta_cell'].sample(frac=0.2),
            s=6,
            height=5,
            aspect=1.3)
plt.title('souporcell singlets among all barcodes')