Dev #90

Merged 27 commits on Oct 31, 2019.

Commits (27)
0d3177b  intermediate commit (davidsebfischer, Sep 5, 2019)
f034d71  added tf2 specific unit tests (picciama, Oct 22, 2019)
2179592  excluded norm and beta tests (picciama, Oct 22, 2019)
aa4b863  intermediate commit (davidsebfischer, Oct 27, 2019)
69a2c19  cosmetic changes (davidsebfischer, Oct 27, 2019)
c47d071  fixed small bugs, lgamma does not work yet (davidsebfischer, Oct 27, 2019)
8046122  towards parallel evaluation of numpy code across genes (davidsebfischer, Oct 27, 2019)
0036121  tf -> tf1 (kadam0, Oct 28, 2019)
2252752  added tr_radius decrease for IRLS_GD_TR (picciama, Oct 28, 2019)
c2fbd70  Merge branch 'tf2_compatibility' of github.com:theislab/batchglm into… (kadam0, Oct 28, 2019)
704b308  added newline (kadam0, Oct 28, 2019)
c4d4c1b  change import order in api (kadam0, Oct 28, 2019)
cc90f7a  changes in api (kadam0, Oct 28, 2019)
85a8e66  implemented api for tf1 and tf2 (default:tf1) (kadam0, Oct 28, 2019)
db34de6  add missing api.tf1.train and api.tf2.train files (kadam0, Oct 28, 2019)
c5a150e  numpy graph checks out (davidsebfischer, Oct 28, 2019)
0eaeaf6  added 4 notebooks for benchmarking (picciama, Oct 28, 2019)
6f168be  Merge branch 'tf2_compatibility' of github.com:theislab/batchglm into… (picciama, Oct 28, 2019)
3736a1e  numpy based NB accuracy unit test checks out (davidsebfischer, Oct 30, 2019)
d16070f  added feature-wise convergence to numpy fitting (davidsebfischer, Oct 30, 2019)
5674d2f  Merge branch 'numpy_linalg' into tf2_compatibility (davidsebfischer, Oct 30, 2019)
066f516  improvements to numpy fitting code and compat changes (davidsebfischer, Oct 31, 2019)
a450383  fixed model api paths (davidsebfischer, Oct 31, 2019)
3a70089  batchglm numpy backend support functional from diffxpy (davidsebfischer, Oct 31, 2019)
d01d1e1  made tf an optional dependency (davidsebfischer, Oct 31, 2019)
5b5b877  depreceated tutorials submodule (davidsebfischer, Oct 31, 2019)
37b4055  Merge pull request #89 from theislab/tf2_compatibility (davidsebfischer, Oct 31, 2019)
3 changes: 2 additions & 1 deletion .gitignore
@@ -12,7 +12,8 @@ resources/*
**/__pycache__
**/.DS_Store
batchglm.egg-info
**/*.ipynb
#**/*.ipynb
tutorials

!**/.gitignore

10 changes: 5 additions & 5 deletions batchglm/api/__init__.py
@@ -1,8 +1,8 @@
from .. import __version__
from ..log_cfg import logger, unconfigure_logging, setup_logging

from . import models
from . import data
from . import models
from . import typing
from . import utils
from .. import pkg_constants

from .. import __version__
from ..log_cfg import logger, unconfigure_logging, setup_logging
from .. import pkg_constants
5 changes: 2 additions & 3 deletions batchglm/api/models/__init__.py
@@ -1,3 +1,2 @@
from . import glm_nb
from . import glm_norm
from . import glm_beta
from . import numpy
from . import tf1
1 change: 1 addition & 0 deletions batchglm/api/models/numpy/__init__.py
@@ -0,0 +1 @@
from . import glm_nb
2 changes: 2 additions & 0 deletions batchglm/api/models/numpy/glm_nb.py
@@ -0,0 +1,2 @@
from batchglm.models.glm_nb import InputDataGLM, Model, Simulator
from batchglm.train.numpy.glm_nb import Estimator
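With the reorganized API, each backend lives under its own submodule. The following is an illustrative usage sketch only; the Simulator and Estimator constructor arguments are assumptions and are not shown in this diff:

# Illustrative sketch; constructor signatures below are assumptions, not taken from this diff.
from batchglm.api.models.numpy.glm_nb import Simulator, Estimator
# The tf1 backend stays available under an explicit submodule:
# from batchglm.api.models.tf1.glm_nb import Estimator

sim = Simulator()                                   # assumed default constructor
sim.generate_sample_description(num_conditions=2, num_batches=4)
sim.generate()                                      # assumed helper
estimator = Estimator(input_data=sim.input_data)    # assumed arguments
estimator.initialize()
estimator.train_sequence(training_strategy="DEFAULT")
estimator.finalize()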
3 changes: 3 additions & 0 deletions batchglm/api/models/tf1/__init__.py
@@ -0,0 +1,3 @@
from . import glm_beta
from . import glm_nb
from . import glm_norm
@@ -1,2 +1,2 @@
from batchglm.models.glm_beta import InputDataGLM, Model, Simulator
from batchglm.train.tf.glm_beta import Estimator
from batchglm.train.tf1.glm_beta import Estimator
@@ -1,2 +1,2 @@
from batchglm.models.glm_nb import InputDataGLM, Model, Simulator
from batchglm.train.tf.glm_nb import Estimator
from batchglm.train.tf1.glm_nb import Estimator
@@ -1,2 +1,2 @@
from batchglm.models.glm_norm import InputDataGLM, Model, Simulator
from batchglm.train.tf.glm_norm import Estimator
from batchglm.train.tf1.glm_norm import Estimator
27 changes: 1 addition & 26 deletions batchglm/models/base/estimator.py
@@ -89,7 +89,7 @@ def initialize(self, **kwargs):
pass

@abc.abstractmethod
def train(self, learning_rate=None, **kwargs):
def train(self, **kwargs):
"""
Starts the training routine
"""
@@ -103,31 +103,6 @@ def finalize(self, **kwargs):
"""
pass

def train_sequence(self, training_strategy=TrainingStrategy.AUTO):
"""
Starts a sequence of training routines

:param training_strategy: List of dicts or enum with parameters which will be passed to self.train().

- `training_strategy = [ {"learning_rate": 0.5}, {"learning_rate": 0.05} ]` is equivalent to
`self.train(learning_rate=0.5); self.train(learning_rate=0.05);`

- Can also be an enum: self.TrainingStrategy.[AUTO|DEFAULT|EXACT|QUICK|...]
- Can also be a str: "[AUTO|DEFAULT|EXACT|QUICK|...]"
"""
if isinstance(training_strategy, Enum):
training_strategy = training_strategy.value
elif isinstance(training_strategy, str):
training_strategy = self.TrainingStrategy[training_strategy].value

if training_strategy is None:
training_strategy = self.TrainingStrategy.DEFAULT.value

for idx, d in enumerate(training_strategy):
logger.info("Beginning with training sequence #%d", idx + 1)
self.train(**d)
logger.info("Training sequence #%d complete", idx + 1)

def _plot_coef_vs_ref(
self,
true_values: np.ndarray,
4 changes: 4 additions & 0 deletions batchglm/models/base/model.py
@@ -22,6 +22,10 @@ def __init__(
):
self.input_data = input_data

@property
def x(self):
return self.input_data.x

def get(self, key: Union[str, Iterable]) -> Union[Any, Dict[str, Any]]:
"""
Returns the values specified by key.
16 changes: 16 additions & 0 deletions batchglm/models/base_glm/model.py
@@ -108,6 +108,22 @@ def location(self):
def scale(self):
return self.inverse_link_scale(self.eta_scale)

@abc.abstractmethod
def eta_loc_j(self, j) -> np.ndarray:
pass

def eta_scale_j(self, j) -> np.ndarray:
# Make sure that dimensionality of sliced array is kept:
if isinstance(j, int) or isinstance(j, np.int32) or isinstance(j, np.int64):
j = [j]
return np.matmul(self.design_scale, self.b[:, j])

def location_j(self, j):
return self.inverse_link_loc(self.eta_loc_j(j=j))

def scale_j(self, j):
return self.inverse_link_scale(self.eta_scale_j(j=j))

@property
def size_factors(self) -> Union[np.ndarray, None]:
if self.input_data is None:
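The new *_j accessors wrap integer indices in a list so the sliced parameter matrix keeps its feature axis. A small stand-alone numpy illustration (names are placeholders, not the batchglm API):

import numpy as np

design_scale = np.ones((100, 1))   # (observations x scale coefficients)
b = np.random.randn(1, 5)          # (scale coefficients x features)

j = 2
print(b[:, j].shape)               # (1,)   integer indexing drops the feature axis
print(b[:, [j]].shape)             # (1, 1) list indexing keeps the array 2-D
print(np.matmul(design_scale, b[:, [j]]).shape)  # (100, 1), as eta_scale_j expects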
6 changes: 5 additions & 1 deletion batchglm/models/base_glm/simulator.py
@@ -79,6 +79,7 @@ def generate_sample_description(
self,
num_conditions=2,
num_batches=4,
intercept_scale: bool = False,
**kwargs
):
self.sim_design_loc, self.sample_description = generate_sample_description(
@@ -87,7 +88,10 @@
num_batches=num_batches,
**kwargs
)
self.sim_design_scale = self.sim_design_loc
if intercept_scale:
self.sim_design_scale = patsy.dmatrix("~1", self.sample_description)
else:
self.sim_design_scale = self.sim_design_loc

def _generate_params(
self,
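The new intercept_scale flag builds an intercept-only design matrix for the scale model, which matches the single-scale-coefficient restriction of the numpy estimator further below. A stand-alone patsy sketch with a made-up sample description:

import pandas as pd
import patsy

sample_description = pd.DataFrame({
    "condition": ["a", "a", "b", "b"],
    "batch": ["0", "1", "0", "1"],
})
design_scale = patsy.dmatrix("~1", sample_description)
print(design_scale)  # a single intercept column of ones, shape (observations x 1)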
9 changes: 9 additions & 0 deletions batchglm/models/glm_nb/model.py
@@ -32,6 +32,15 @@ def eta_loc(self) -> np.ndarray:
eta += np.expand_dims(self.size_factors, axis=1)
return eta

def eta_loc_j(self, j) -> np.ndarray:
# Make sure that dimensionality of sliced array is kept:
if isinstance(j, int) or isinstance(j, np.int32) or isinstance(j, np.int64):
j = [j]
eta = np.matmul(self.design_loc, self.a[:, j])
if self.size_factors is not None:
eta += np.expand_dims(self.size_factors, axis=1)
return eta

# Re-parameterizations:

@property
2 changes: 2 additions & 0 deletions batchglm/pkg_constants.py
@@ -37,6 +37,8 @@
TRUST_REGION_T2 = 1.5 # Allow expansion if not shrinking.
TRUST_REGION_UPPER_BOUND = 1e5

TRUST_REGIONT_T1_IRLS_GD_TR_SCALE = 1

# Convergence hyper-parameters:
LLTOL_BY_FEATURE = 1e-10
XTOL_BY_FEATURE_LOC = 1e-8
1 change: 1 addition & 0 deletions batchglm/train/numpy/__init__.py
@@ -0,0 +1 @@
from . import glm_nb as nb
4 changes: 4 additions & 0 deletions batchglm/train/numpy/base_glm/__init__.py
@@ -0,0 +1,4 @@
from .processModel import ProcessModelGlm
from .model import ModelIwls
from .estimator import EstimatorGlm
from .vars import ModelVarsGlm
211 changes: 211 additions & 0 deletions batchglm/train/numpy/base_glm/estimator.py
@@ -0,0 +1,211 @@
import abc
import logging
import numpy as np
import pprint
import scipy
import scipy.optimize

from .external import _EstimatorGLM, pkg_constants
from .training_strategies import TrainingStrategies

logger = logging.getLogger("batchglm")


class EstimatorGlm(_EstimatorGLM, metaclass=abc.ABCMeta):
"""
Estimator for Generalized Linear Models (GLMs).
"""

def __init__(
self,
model,
input_data,
dtype,
):
if input_data.design_scale.shape[1] != 1:
raise ValueError("cannot model more than one scale parameter with numpy backend right now.")
_EstimatorGLM.__init__(
self=self,
model=model,
input_data=input_data
)
self.dtype = dtype
self.values = []
self.lls = []

self.TrainingStrategies = TrainingStrategies

def initialize(self):
pass

def train_sequence(
self,
training_strategy: str = "DEFAULT"
):
if isinstance(training_strategy, str):
training_strategy = self.TrainingStrategies[training_strategy].value[0]

if training_strategy is None:
training_strategy = self.TrainingStrategies.DEFAULT.value

logging.getLogger("batchglm").info("training strategy:\n%s", pprint.pformat(training_strategy))
self.train(**training_strategy)

def train(
self,
max_steps: int,
update_b_freq: int = 5
):
# Iterate until conditions are fulfilled.
train_step = 0
delayed_converged = np.tile(False, self.model.model_vars.n_features)

ll_current = - self.model.ll_byfeature
logging.getLogger("batchglm").debug("iter %i: ll=%f" % (0, np.sum(ll_current)))
while np.any(np.logical_not(delayed_converged)) and \
train_step < max_steps:
# Update parameters:
# Line search step for scale model:
if train_step % update_b_freq == 0 and train_step > 0:
b_var_cache = self.model.b_var.copy()
self.model.b_var = self.b_step(idx=np.where(np.logical_not(delayed_converged))[0])
# Reverse update by feature if update leads to worse loss:
ll_proposal = - self.model.ll_byfeature
b_var_new = self.model.b_var.copy()
b_var_new[:, ll_proposal > ll_current] = b_var_cache[:, ll_proposal > ll_current]
self.model.b_var = b_var_new
delayed_b_converged = self.model.converged.copy()
# IWLS step for location model:
self.model.a_var = self.model.a_var + self.iwls_step()

# Evaluate convergence
ll_previous = ll_current
ll_current = - self.model.ll_byfeature
converged_f = (ll_previous - ll_current) / ll_previous < pkg_constants.LLTOL_BY_FEATURE
# Location model convergence status has to be updated if b model was updated
if train_step % update_b_freq == 0 and train_step > 0:
self.model.converged = converged_f
delayed_converged = converged_f
else:
self.model.converged = np.logical_or(self.model.converged, converged_f)
train_step += 1
logging.getLogger("batchglm").debug(
"iter %i: ll=%f, converged: %i" %
(train_step, np.sum(ll_current), np.sum(self.model.converged))
)
self.lls.append(ll_current)

def iwls_step(self) -> np.ndarray:
"""

:return: (inferred param x features)
"""
w = self.model.fim_weight_j(j=self.model.idx_not_converged) # (observations x features)
ybar = self.model.ybar_j(j=self.model.idx_not_converged) # (observations x features)
# Translate to problem of form ax = b for each feature:
# (in the following, X=design and Y=counts)
# a=X^T*W*X: ([features] x inferred param)
# x=theta: ([features] x inferred param)
# b=X^T*W*Ybar: ([features] x inferred param)
xh = np.matmul(self.model.design_loc, self.model.constraints_loc)
xhw = np.einsum('ob,of->fob', xh, w)
a = np.einsum('fob,oc->fbc', xhw, xh)
b = np.einsum('fob,of->fb', xhw, ybar)
# Via np.linalg.solve:
delta_theta = np.zeros_like(self.model.a_var)
delta_theta[:, self.model.idx_not_converged] = np.linalg.solve(a, b).T
# Via np.linalg.lstsq:
#delta_theta[:, self.idx_not_converged] = np.concatenate([
#    np.expand_dims(np.linalg.lstsq(a[i, :, :], b[i, :])[0], axis=-1)
#    for i in self.idx_not_converged
#], axis=-1)
# Via np.linalg.inv:
#delta_theta[:, self.idx_not_converged] = np.concatenate([
#    np.expand_dims(np.matmul(np.linalg.inv(a[i, :, :]), b[i, :]), axis=-1)
#    for i in self.idx_not_converged
#], axis=-1)
return delta_theta

def b_step(
self,
idx: np.ndarray,
linesearch: bool = False
) -> np.ndarray:
"""

:return:
"""
x0 = -10

def cost_b_var(x):
self.model.b_var_j_setter(value=x, j=j)
return - np.sum(self.model.ll_j(j=j))

def grad_b_var(x):
self.model.b_var_j_setter(value=x, j=j)
return - self.model.jac_b_j(j=j)

b_var_new = self.model.b_var.copy()
for j in idx:
if linesearch:
ls_result = scipy.optimize.line_search(
f=cost_b_var,
myfprime=grad_b_var,
xk=np.array([x0]),
pk=np.array([1.]),
gfk=None,
old_fval=None,
old_old_fval=None,
args=(),
c1=0.0001,
c2=0.9,
amax=50.,
extra_condition=None,
maxiter=1000
)
b_var_new[0, j] = x0 + ls_result[0]
else:
ls_result = scipy.optimize.minimize_scalar(
fun=cost_b_var,
args=(),
method='brent',
tol=None,
options={'maxiter': 500}
)
b_var_new[0, j] = ls_result["x"]

return b_var_new

def finalize(self):
"""
Evaluate all tensors that need to be exported from session and save these as class attributes
and close session.

Changes .model entry from tf1-based EstimatorGraph to numpy based Model instance and
transfers relevant attributes.
"""
# Read from numpy-IRLS estimator specific model:

self._hessian = self.model.hessian
self._fisher_inv = np.linalg.inv(- self._hessian)
self._jacobian = np.sum(np.abs(self.model.jac / self.model.x.shape[0]), axis=1)
self._log_likelihood = self.model.ll_byfeature
self._loss = np.sum(self._log_likelihood)

@abc.abstractmethod
def get_model_container(
self,
input_data
):
pass

@abc.abstractmethod
def init_par(
self,
input_data,
init_a,
init_b,
init_model
):
pass
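For reference, the batched normal-equation solve in iwls_step() can be reproduced with plain numpy. The sketch below uses made-up dimensions and omits the constraints matrix (so xh reduces to design_loc in this toy case); all names are illustrative, not batchglm API:

import numpy as np

n_obs, n_coef, n_features = 50, 3, 4
design_loc = np.random.randn(n_obs, n_coef)   # X
w = np.random.rand(n_obs, n_features)         # IRLS weights per observation and feature
ybar = np.random.randn(n_obs, n_features)     # working residuals per observation and feature

xhw = np.einsum('ob,of->fob', design_loc, w)      # (features x observations x coefficients)
a = np.einsum('fob,oc->fbc', xhw, design_loc)     # X^T W X per feature
b = np.einsum('fob,of->fb', xhw, ybar)            # X^T W ybar per feature
delta_theta = np.linalg.solve(a, b).T             # one linear solve per feature, (coefficients x features)
print(delta_theta.shape)                          # (3, 4)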
