diff --git a/docs/api/interval.md b/docs/api/interval.md
index 6b546ea..afe6d10 100644
--- a/docs/api/interval.md
+++ b/docs/api/interval.md
@@ -1,34 +1,5 @@
 # Interval Score
 
-## Interval or Winkler Score
-
-For a prediction interval (PI), the interval or Winkler score is given by:
-
-$$
-\text{IS} = \begin{cases}
-(u - l) + \frac{2}{\alpha}(l - y) & \text{for } y < l \\
-(u - l) & \text{for } l \leq y \leq u \\
-(u - l) + \frac{2}{\alpha}(y - u) & \text{for } y > u. \\
-\end{cases}
-$$
-
-for an $(1 - \alpha)$PI of $[l, u]$ and the true value $y$ [@gneiting_strictly_2007, @bracher2021evaluating @winkler1972decision].
-
-## Weighted Interval Score
-
-The weighted interval score (WIS) is defined as
-
-$$
-\text{WIS}_{\alpha_{0:K}}(F, y) = \frac{1}{K+0.5}(w_0 \times |y - m| + \sum_{k=1}^K (w_k \times IS_{\alpha_k}(F, y)))
-$$
-
-where $m$ denotes the median prediction, $w_0$ denotes the weight of the median prediction, $IS_{\alpha_k}(F, y)$ denotes the interval score for the $1 - \alpha$ prediction interval and $w_k$ is the according weight. The WIS is calculated for a set of (central) PIs and the predictive median [@bracher2021evaluating]. The weights are an optional parameter and default weight is the canonical weight $w_k = \frac{2}{\alpha_k}$ and $w_0 = 0.5$. For these weights, it holds that:
-
-$$
-\text{WIS}_{\alpha_{0:K}}(F, y) \approx \text{CRPS}(F, y).
-$$
-
-
 ::: scoringrules.interval_score
 
 ::: scoringrules.weighted_interval_score
diff --git a/pyproject.toml b/pyproject.toml
index 77ad759..3a612a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,9 +58,13 @@ dev-dependencies = [
 ]
 
 [tool.ruff]
+line-length = 88
 
 [tool.ruff.lint]
 ignore = ["E741"]
 
 [tool.coverage.run]
 omit = ["**/_gufuncs.py", "**/_gufunc.py"]
+
+[tool.coverage.report]
+exclude_also = ["if tp.TYPE_CHECKING:"]
diff --git a/scoringrules/_crps.py b/scoringrules/_crps.py
index 2ec277a..4846884 100644
--- a/scoringrules/_crps.py
+++ b/scoringrules/_crps.py
@@ -58,7 +58,12 @@ def crps_ensemble(
     if axis != -1:
         forecasts = B.moveaxis(forecasts, axis, -1)
 
-    if not sorted_ensemble and estimator not in ["nrg", "akr", "akr_circperm", "fair"]:
+    if not sorted_ensemble and estimator not in [
+        "nrg",
+        "akr",
+        "akr_circperm",
+        "fair",
+    ]:
         forecasts = B.sort(forecasts, axis=-1)
 
     if backend == "numba":
@@ -865,7 +870,14 @@ def crps_gtclogistic(
     >>> sr.crps_gtclogistic(0.0, 0.1, 0.4, -1.0, 1.0, 0.1, 0.1)
     """
     return crps.gtclogistic(
-        observation, location, scale, lower, upper, lmass, umass, backend=backend
+        observation,
+        location,
+        scale,
+        lower,
+        upper,
+        lmass,
+        umass,
+        backend=backend,
     )
 
 
@@ -953,7 +965,14 @@ def crps_clogistic(
     lmass = stats._logis_cdf((lower - location) / scale)
     umass = 1 - stats._logis_cdf((upper - location) / scale)
     return crps.gtclogistic(
-        observation, location, scale, lower, upper, lmass, umass, backend=backend
+        observation,
+        location,
+        scale,
+        lower,
+        upper,
+        lmass,
+        umass,
+        backend=backend,
     )
 
 
@@ -990,7 +1009,14 @@ def crps_gtcnormal(
     >>> sr.crps_gtcnormal(0.0, 0.1, 0.4, -1.0, 1.0, 0.1, 0.1)
     """
     return crps.gtcnormal(
-        observation, location, scale, lower, upper, lmass, umass, backend=backend
+        observation,
+        location,
+        scale,
+        lower,
+        upper,
+        lmass,
+        umass,
+        backend=backend,
     )
 
 
@@ -1078,7 +1104,14 @@ def crps_cnormal(
     lmass = stats._norm_cdf((lower - location) / scale)
     umass = 1 - stats._norm_cdf((upper - location) / scale)
     return crps.gtcnormal(
-        observation, location, scale, lower, upper, lmass, umass, backend=backend
+        observation,
+        location,
+        scale,
+        lower,
+        upper,
+        lmass,
+        umass,
+        backend=backend,
     )
 
 
@@ -1146,7 +1179,15 @@ def crps_gtct(
     >>> sr.crps_gtct(0.0, 2.0, 0.1, 0.4, -1.0, 1.0, 0.1, 0.1)
     """
     return crps.gtct(
-        observation, df, location, scale, lower, upper, lmass, umass, backend=backend
+        observation,
+        df,
+        location,
+        scale,
+        lower,
+        upper,
+        lmass,
+        umass,
+        backend=backend,
     )
 
 
@@ -1192,7 +1233,15 @@ def crps_tt(
    >>> sr.crps_tt(0.0, 2.0, 0.1, 0.4, -1.0, 1.0)
     """
     return crps.gtct(
-        observation, df, location, scale, lower, upper, 0.0, 0.0, backend=backend
+        observation,
+        df,
+        location,
+        scale,
+        lower,
+        upper,
+        0.0,
+        0.0,
+        backend=backend,
     )
 
 
@@ -1240,7 +1289,15 @@ def crps_ct(
     lmass = stats._t_cdf((lower - location) / scale, df)
     umass = 1 - stats._t_cdf((upper - location) / scale, df)
     return crps.gtct(
-        observation, df, location, scale, lower, upper, lmass, umass, backend=backend
+        observation,
+        df,
+        location,
+        scale,
+        lower,
+        upper,
+        lmass,
+        umass,
+        backend=backend,
     )
diff --git a/scoringrules/_interval.py b/scoringrules/_interval.py
index 4f8a530..749f21d 100644
--- a/scoringrules/_interval.py
+++ b/scoringrules/_interval.py
@@ -1,5 +1,5 @@
 import typing as tp
-from typing import Optional, Union
+from typing import Optional
 
 from scoringrules.backend import backends
 from scoringrules.core import interval
@@ -9,162 +9,177 @@
 
 
 def interval_score(
-    observations: "ArrayLike",
-    lower: "Array",
-    upper: "Array",
-    alpha: Union[float, "Array"],
-    /,
-    axis: int = -1,
+    obs: "ArrayLike",
+    lower: "ArrayLike",
+    upper: "ArrayLike",
+    alpha: "ArrayLike",
     *,
     backend: "Backend" = None,
 ) -> "Array":
-    r"""Compute the Interval Score or Winkler Score [(Gneiting & Raftery, 2012)](https://www.tandfonline.com/doi/abs/10.1198/016214506000001437) for 1 - $\alpha$ prediction intervals PI = [lower, upper].
-
-    The interval score is defined as
+    r"""Compute the Interval Score or Winkler Score.
 
-    $\text{IS} = \begin{cases}
-    (u - l) + \frac{2}{\alpha}(l - y) & \text{for } y < l \\
-    (u - l) & \text{for } l \leq y \leq u \\
-    (u - l) + \frac{2}{\alpha}(y - u) & \text{for } y > u. \\
-    \end{cases}$
+    The interval score
+    [(Gneiting & Raftery, 2007)](https://doi.org/10.1198/016214506000001437)
+    is defined as
 
-    for an $1 - \alpha$ PI of $[l, u]$ and the true value $y$.
+    $$
+    \text{IS} =
+    \begin{cases}
+    (u - l) + \frac{2}{\alpha}(l - y) & \text{for } y < l \\
+    (u - l) & \text{for } l \leq y \leq u \\
+    (u - l) + \frac{2}{\alpha}(y - u) & \text{for } y > u. \\
+    \end{cases}
+    $$
 
-    Note
-    ----
-    Note that alpha can be a float or an array of coverages.
-    In the case alpha is a float, the output will have the same shape as the observations and we assume that shape of observations,
-    upper and lower is the same. In case alpha is a vector, the function will broadcast observations accordingly.
+    for a $1 - \alpha$ prediction interval of $[l, u]$ and the true value $y$.
 
     Parameters
     ----------
-    observations: ArrayLike
-        The observed values.
-    lower: Array
-        The predicted lower bound of the prediction interval.
-    upper: Array
-        The predicted upper bound of the prediction interval.
-    alpha: Union[float, Array]
-        The 1 - alpha level for the prediction interval.
-    axis: int
-        The axis corresponding to the ensemble. Default is the last axis.
-    backend: str
+    obs:
+        The observations as a scalar or array of values.
+    lower:
+        The predicted lower bound of the PI as a scalar or array of values.
+    upper:
+        The predicted upper bound of the PI as a scalar or array of values.
+    alpha:
+        The 1 - alpha level for the PI as a scalar or array of values.
+    backend:
         The name of the backend used for computations. Defaults to 'numba' if available, else 'numpy'.
 
     Returns
     -------
-    score: Array
-        An array of interval scores for each prediction interval, which should be averaged to get meaningful values.
+    score:
+        Array with the interval score for the input values.
+
+    Raises
+    ------
+    ValueError:
+        If the lower and upper bounds do not have the same
+        shape or if the number of PIs does not match the number of alpha levels.
+
+    Notes
+    -----
+    Given an `obs` array of shape `(...,)`: when multiple PIs are evaluated,
+    `alpha` is an array of shape `(K,)`, `lower` and `upper` must have shape
+    `(...,K)`, and the output will have shape `(...,K)`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scoringrules as sr
+    >>> sr.interval_score(0.1, 0.0, 0.4, 0.5)
+    0.4
+
+    >>> sr.interval_score(
+    ...     obs=np.array([0.1, 0.2, 0.3]),
+    ...     lower=np.array([0.0, 0.1, 0.2]),
+    ...     upper=np.array([0.4, 0.3, 0.5]),
+    ...     alpha=0.5,
+    ... )
+    array([0.4, 0.2, 0.3])
+
+    >>> sr.interval_score(
+    ...     obs=np.random.uniform(size=(10,)),
+    ...     lower=np.ones((10,5)) * 0.2,
+    ...     upper=np.ones((10,5)) * 0.8,
+    ...     alpha=np.linspace(0.1, 0.9, 5),
+    ... ).shape
+    (10, 5)
     """
     B = backends.active if backend is None else backends[backend]
 
-    single_alpha = isinstance(alpha, float)
-
-    observations, lower, upper = map(B.asarray, (observations, lower, upper))
-
-    if axis != -1:
-        lower = B.moveaxis(lower, axis, -1)
-        upper = B.moveaxis(upper, axis, -1)
-
-    if single_alpha:
-        if B.name == "numba":
-            return interval._interval_score_gufunc(observations, lower, upper, alpha)
+    obs, lower, upper, alpha = map(B.asarray, (obs, lower, upper, alpha))
 
-        return interval._interval_score(
-            observations, lower, upper, alpha, backend=backend
+    if lower.shape != upper.shape:
+        raise ValueError(
+            "The lower and upper bounds must have the same shape."
+            f" Got lower {lower.shape} and upper {upper.shape}."
         )
-    else:
-        alpha = B.asarray(alpha)
-
-        if B.name == "numba":
-            return interval._interval_score_gufunc(
-                observations[..., None], lower, upper, alpha
+    if alpha.ndim == 1:
+        obs = obs[..., None]
+        if (lower.shape[-1] != alpha.shape[0]) or (upper.shape[-1] != alpha.shape[0]):
+            raise ValueError(
+                "The number of PIs does not match the number of alpha levels."
+                f" Got lower and upper of shape {lower.shape}"
+                f" for alpha of shape {alpha.shape}."
             )
 
-        return interval._interval_score(
-            observations[..., None], lower, upper, alpha, backend=backend
-        )
+    res = interval.interval_score(obs, lower, upper, alpha)
+    return B.squeeze(res)
 
 
 def weighted_interval_score(
-    observations: "ArrayLike",
+    obs: "ArrayLike",
     median: "Array",
     lower: "Array",
     upper: "Array",
     alpha: "Array",
     /,
-    weight_median: Optional[float] = None,
-    weight_alpha: Optional["Array"] = None,
-    axis: int = -1,
+    w_median: Optional[float] = None,
+    w_alpha: Optional["Array"] = None,
     *,
     backend: "Backend" = None,
 ) -> "Array":
-    r"""Compute the Interval Score or Winkler Score [(Bracher J, Ray EL, Gneiting T, Reich NG, 2022)](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008618) for 1 - $\alpha$ prediction intervals PI = [lower, upper].
+    r"""Compute the weighted interval score (WIS).
+
+    The WIS [(Bracher et al., 2021)](https://doi.org/10.1371/journal.pcbi.1008618)
+    is defined as
 
-    The weighted interval score (WIS) is defined as
+    $$
+    \text{WIS}_{\alpha_{0:K}}(F, y) = \frac{1}{K+0.5}(w_0 \times |y - m|
+    + \sum_{k=1}^K (w_k \times IS_{\alpha_k}(F, y)))
+    $$
 
-    $\text{WIS}_{\alpha_{0:K}}(F, y) = \frac{1}{K+0.5}(w_0 \times |y - m| + \sum_{k=1}^K (w_k \times IS_{\alpha_k}(F, y)))$
+    where $m$ denotes the median prediction, $w_0$ denotes the weight of the
+    median prediction, $IS_{\alpha_k}(F, y)$ denotes the interval score for the
+    $1 - \alpha_k$ prediction interval, and $w_k$ is the corresponding weight.
+    The WIS is calculated for a set of (central) PIs and the predictive median.
+    The weights are optional; the defaults are the canonical weights
+    $w_k = \frac{\alpha_k}{2}$ and $w_0 = 0.5$.
+    For these weights, it holds that:
 
-    where $m$ denotes the median prediction, $w_0$ denotes the weight of the median prediction,
-    $IS_{\alpha_k}(F, y)$ denotes the interval score for the $1 - \alpha$ prediction interval and
-    $w_k$ is the according weight. The WIS is calculated for a set of (central) PIs and the predictive
-    median. The weights are an optional parameter and default weight
-    is the canonical weight $w_k = \frac{2}{\alpha_k}$ and $w_0 = 0.5$. Using the canonical weights, the WIS
-    can be used to approximate the CRPS.
+    $$
+    \text{WIS}_{\alpha_{0:K}}(F, y) \approx \text{CRPS}(F, y).
+    $$
 
     Parameters
     ----------
-    observations: ArrayLike
-        The observed values.
-    median: Array
-        The median prediction
-    lower: Array
-        The predicted lower bound of the prediction interval.
-    upper: Array
-        The predicted upper bound of the prediction interval.
-    alpha: Array
-        The 1 - alpha level for the prediction interval.
-    weight_median: float
-        The weight for the median prediction.
-    weight_alpha: Array
-        The weights for the PI.
-    axis: int
-        The axis corresponding to the ensemble. Default is the last axis.
-    backend: str
+    obs:
+        The observations as a scalar or array of shape `(...,)`.
+    median:
+        The predicted median of the distribution as a scalar or array of shape `(...,)`.
+    lower:
+        The predicted lower bound of the PI. If `alpha` is an array of shape `(K,)`,
+        `lower` must have shape `(...,K)`.
+    upper:
+        The predicted upper bound of the PI. If `alpha` is an array of shape `(K,)`,
+        `upper` must have shape `(...,K)`.
+    alpha:
+        The 1 - alpha level for the prediction intervals as an array of shape `(K,)`.
+    w_median:
+        The weight for the median prediction. Defaults to 0.5.
+    w_alpha:
+        The weights for the PI. Defaults to `alpha / 2`.
+    backend:
         The name of the backend used for computations. Defaults to 'numba' if available, else 'numpy'.
 
     Returns
     -------
-    score: Array
-        An array of interval scores for each observation, which should be averaged to get meaningful values.
+    score:
+        An array of weighted interval scores with the same shape as `obs`.
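+
+    Examples
+    --------
+    A minimal shape check; the numbers below are illustrative placeholders,
+    not reference values from Bracher et al. (2021):
+
+    >>> import numpy as np
+    >>> import scoringrules as sr
+    >>> sr.weighted_interval_score(
+    ...     np.zeros((10,)),
+    ...     np.zeros((10,)),
+    ...     np.ones((10, 5)) * -1.0,
+    ...     np.ones((10, 5)),
+    ...     np.linspace(0.1, 0.9, 5),
+    ... ).shape
+    (10,)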
""" - if weight_alpha is None: - weight_alpha = alpha / 2 - if weight_median is None: - weight_median = 0.5 + if w_median is None: + w_median = 0.5 + if w_alpha is None: + w_alpha = alpha / 2 B = backends.active if backend is None else backends[backend] - observations, median, lower, upper, alpha, weight_alpha, weight_median = map( + args = map( B.asarray, - (observations, median, lower, upper, alpha, weight_alpha, weight_median), + (obs, median, lower, upper, alpha, w_median, w_alpha), ) - if axis != -1: - lower = B.moveaxis(lower, axis, -1) - upper = B.moveaxis(upper, axis, -1) - if B.name == "numba": - return interval._weighted_interval_score_gufunc( - observations, median, lower, upper, alpha, weight_median, weight_alpha - ) + return interval._weighted_interval_score_gufunc(*args) - return interval._weighted_interval_score( - observations, - median, - lower, - upper, - alpha, - weight_median, - weight_alpha, - backend=backend, - ) + return interval.weighted_interval_score(*args, backend=backend) diff --git a/scoringrules/backend/base.py b/scoringrules/backend/base.py index 350e215..10d6200 100644 --- a/scoringrules/backend/base.py +++ b/scoringrules/backend/base.py @@ -68,7 +68,10 @@ def quantile( @abc.abstractmethod def max( - self, x: "Array", axis: int | tuple[int, ...] | None, keepdims: bool = False + self, + x: "Array", + axis: int | tuple[int, ...] | None, + keepdims: bool = False, ) -> "Array": """Return the maximum value of an input array ``x``.""" @@ -111,7 +114,11 @@ def unique_values(self, x: "Array", /) -> "Array": @abc.abstractmethod def concat( - self, arrays: tuple["Array", ...] | list["Array"], /, *, axis: int | None = 0 + self, + arrays: tuple["Array", ...] | list["Array"], + /, + *, + axis: int | None = 0, ) -> "Array": """Join a sequence of arrays along an existing axis.""" diff --git a/scoringrules/backend/jax.py b/scoringrules/backend/jax.py index 8dcdd2b..80332cb 100644 --- a/scoringrules/backend/jax.py +++ b/scoringrules/backend/jax.py @@ -69,7 +69,10 @@ def quantile( return jnp.quantile(x, q, axis=axis, keepdims=keepdims) def max( - self, x: "Array", axis: int | tuple[int, ...] | None, keepdims: bool = False + self, + x: "Array", + axis: int | tuple[int, ...] | None, + keepdims: bool = False, ) -> "Array": return jnp.max(x, axis=axis, keepdims=keepdims) @@ -107,7 +110,11 @@ def unique_values(self, x: "Array") -> "Array": return jnp.unique(x) def concat( - self, arrays: tuple["Array", ...] | list["Array"], /, *, axis: int | None = 0 + self, + arrays: tuple["Array", ...] | list["Array"], + /, + *, + axis: int | None = 0, ) -> "Array": return jnp.concatenate(arrays, axis=axis) diff --git a/scoringrules/backend/numpy.py b/scoringrules/backend/numpy.py index 79393cc..1fce50f 100644 --- a/scoringrules/backend/numpy.py +++ b/scoringrules/backend/numpy.py @@ -69,7 +69,10 @@ def quantile( return np.quantile(x, q, axis=axis, keepdims=keepdims) def max( - self, x: "NDArray", axis: int | tuple[int, ...] | None, keepdims: bool = False + self, + x: "NDArray", + axis: int | tuple[int, ...] | None, + keepdims: bool = False, ) -> "NDArray": return np.max(x, axis=axis, keepdims=keepdims) @@ -124,7 +127,11 @@ def squeeze( return np.squeeze(x, axis=axis) def stack( - self, arrays: tuple["NDArray", ...] | list["NDArray"], /, *, axis: int = 0 + self, + arrays: tuple["NDArray", ...] 
| list["NDArray"], + /, + *, + axis: int = 0, ) -> "NDArray": return np.stack(arrays, axis=axis) diff --git a/scoringrules/backend/tensorflow.py b/scoringrules/backend/tensorflow.py index daf6323..9143607 100644 --- a/scoringrules/backend/tensorflow.py +++ b/scoringrules/backend/tensorflow.py @@ -76,7 +76,10 @@ def quantile( raise NotImplementedError def max( - self, x: "Tensor", axis: int | tuple[int, ...] | None, keepdims: bool = False + self, + x: "Tensor", + axis: int | tuple[int, ...] | None, + keepdims: bool = False, ) -> "Tensor": return tf.math.reduce_max(x, axis=axis, keepdims=keepdims) diff --git a/scoringrules/backend/torch.py b/scoringrules/backend/torch.py index 933271d..3c7528b 100644 --- a/scoringrules/backend/torch.py +++ b/scoringrules/backend/torch.py @@ -70,7 +70,10 @@ def quantile( return torch.quantile(x, q, dim=axis, keepdim=keepdims) def max( - self, x: "Tensor", axis: int | tuple[int, ...] | None, keepdims: bool = False + self, + x: "Tensor", + axis: int | tuple[int, ...] | None, + keepdims: bool = False, ) -> "Tensor": return torch.max(x, axis=axis, keepdim=keepdims)[0] @@ -105,7 +108,11 @@ def unique_values(self, x: "Tensor", /) -> "Tensor": return torch.unique(x) def concat( - self, arrays: tuple["Tensor", ...] | list["Tensor"], /, *, axis: int | None = 0 + self, + arrays: tuple["Tensor", ...] | list["Tensor"], + /, + *, + axis: int | None = 0, ) -> "Tensor": return torch.concat(arrays, axis=axis) diff --git a/scoringrules/core/crps/_closed.py b/scoringrules/core/crps/_closed.py index e541b63..1cddcde 100644 --- a/scoringrules/core/crps/_closed.py +++ b/scoringrules/core/crps/_closed.py @@ -111,7 +111,8 @@ def _inner(params): # option 1: in a loop s = B.stack( - [_inner(params) for params in zip(obs, n, prob, strict=True)], axis=-1 + [_inner(params) for params in zip(obs, n, prob, strict=True)], + axis=-1, ) # option 2: apply_along_axis (does not work with JAX) @@ -526,7 +527,10 @@ def laplace( def logistic( - obs: "ArrayLike", mu: "ArrayLike", sigma: "ArrayLike", backend: "Backend" = None + obs: "ArrayLike", + mu: "ArrayLike", + sigma: "ArrayLike", + backend: "Backend" = None, ) -> "Array": """Compute the CRPS for the normal distribution.""" B = backends.active if backend is None else backends[backend] @@ -648,7 +652,10 @@ def negbinom( def normal( - obs: "ArrayLike", mu: "ArrayLike", sigma: "ArrayLike", backend: "Backend" = None + obs: "ArrayLike", + mu: "ArrayLike", + sigma: "ArrayLike", + backend: "Backend" = None, ) -> "Array": """Compute the CRPS for the logistic distribution.""" B = backends.active if backend is None else backends[backend] diff --git a/scoringrules/core/interval/__init__.py b/scoringrules/core/interval/__init__.py index 3382b26..6f3ef6c 100644 --- a/scoringrules/core/interval/__init__.py +++ b/scoringrules/core/interval/__init__.py @@ -1,14 +1,12 @@ try: - from ._gufunc import _interval_score_gufunc, _weighted_interval_score_gufunc + from ._gufunc import _weighted_interval_score_gufunc except ImportError: - _interval_score_gufunc = None _weighted_interval_score_gufunc = None -from ._score import _interval_score, _weighted_interval_score +from ._score import interval_score, weighted_interval_score __all__ = [ - "_interval_score", - "_weighted_interval_score", - "_interval_score_gufunc", + "interval_score", + "weighted_interval_score", "_weighted_interval_score_gufunc", ] diff --git a/scoringrules/core/interval/_gufunc.py b/scoringrules/core/interval/_gufunc.py index 52e96fa..0c3b62d 100644 --- a/scoringrules/core/interval/_gufunc.py +++ 
+++ b/scoringrules/core/interval/_gufunc.py
@@ -1,25 +1,5 @@
 import numpy as np
-from numba import guvectorize, vectorize
-
-
-@vectorize(
-    [
-        "float64(float64, float64, float64, float64)",
-        "float32(float32, float32, float32, float32)",
-    ]
-)
-def _interval_score_gufunc(
-    obs: np.ndarray,
-    lower: np.ndarray,
-    upper: np.ndarray,
-    alpha: np.ndarray,
-):
-    """Interval score or Winkler score."""
-    return (
-        (upper - lower)
-        + (obs < lower) * (2 / alpha) * (lower - obs)
-        + (obs > upper) * (2 / alpha) * (obs - upper)
-    )
+from numba import guvectorize
 
 
 @guvectorize(
diff --git a/scoringrules/core/interval/_score.py b/scoringrules/core/interval/_score.py
index 899e1cb..92cfc4e 100644
--- a/scoringrules/core/interval/_score.py
+++ b/scoringrules/core/interval/_score.py
@@ -6,41 +6,36 @@
 from scoringrules.core.typing import Array, Backend
 
 
-def _interval_score(
+def interval_score(
     obs: "Array",
     lower: "Array",
     upper: "Array",
     alpha: "Array",
-    backend: "Backend" = None,
 ) -> "Array":
     """Winkler or Interval Score for prediction interval PI[lower, upper] and observations."""
-    # We don't need the backend here
     width = upper - lower
     above = obs > upper
     below = obs < lower
-    W = width + (
-        below * (2 / alpha) * (lower - obs) + above * (2 / alpha) * (obs - upper)
-    )
+    W = width + below * (2 / alpha) * (lower - obs)
+    W += above * (2 / alpha) * (obs - upper)
     return W
 
 
-def _weighted_interval_score(
+def weighted_interval_score(
     obs: "Array",
     median: "Array",
     lower: "Array",
     upper: "Array",
     alpha: "Array",
-    weight_median: "Array",
-    weight_alpha: "Array",
+    w_median: "Array",
+    w_alpha: "Array",
     backend: "Backend" = None,
 ) -> "Array":
     """Weighted Interval Score for prediction interval PI[lower, upper]."""
     B = backends.active if backend is None else backends[backend]
-    K = weight_alpha.shape[0]
-    IS = _interval_score(obs, lower, upper, alpha)
-    WIS = (
-        B.sum(IS * B.expand_dims(weight_alpha, axis=-2), axis=-1)
-        + weight_median * median
-    )
+    K = w_alpha.shape[0]
+    # broadcast obs against the K prediction intervals stacked in the last axis
+    IS = interval_score(obs[..., None], lower, upper, alpha)
+    WIS = B.sum(IS * w_alpha, axis=-1)
+    # median term of the WIS definition: w_0 * |y - m|
+    WIS += w_median * B.abs(obs - median)
     WIS /= K + 1 / 2
-    return WIS
+    return B.squeeze(WIS)
diff --git a/scoringrules/core/kernels/_approx.py b/scoringrules/core/kernels/_approx.py
index 8e86fa8..528d096 100644
--- a/scoringrules/core/kernels/_approx.py
+++ b/scoringrules/core/kernels/_approx.py
@@ -33,7 +33,8 @@ def ensemble_uv(
     M: int = fct.shape[-1]
     e_1 = B.sum(gauss_kern_uv(obs[..., None], fct, backend=backend), axis=-1) / M
     e_2 = B.sum(
-        gauss_kern_uv(fct[..., None], fct[..., None, :], backend=backend), axis=(-1, -2)
+        gauss_kern_uv(fct[..., None], fct[..., None, :], backend=backend),
+        axis=(-1, -2),
     ) / (M**2)
     e_3 = gauss_kern_uv(obs, obs)
 
@@ -161,7 +162,9 @@ def vr_ensemble_uv(
     )
     e_2 = B.sum(
         gauss_kern_uv(
-            B.expand_dims(fct, axis=-1), B.expand_dims(fct, axis=-2), backend=backend
+            B.expand_dims(fct, axis=-1),
+            B.expand_dims(fct, axis=-2),
+            backend=backend,
         )
         * (B.expand_dims(fw, axis=-1) * B.expand_dims(fw, axis=-2)),
         axis=(-1, -2),
diff --git a/scoringrules/core/logarithmic.py b/scoringrules/core/logarithmic.py
index d95f0a9..cd4d7dc 100644
--- a/scoringrules/core/logarithmic.py
+++ b/scoringrules/core/logarithmic.py
@@ -149,7 +149,10 @@ def twopexponential(
 
 
 def gamma(
-    obs: "ArrayLike", shape: "ArrayLike", rate: "ArrayLike", backend: "Backend" = None
+    obs: "ArrayLike",
+    shape: "ArrayLike",
+    rate: "ArrayLike",
+    backend: "Backend" = None,
 ) -> "Array":
     """Compute the logarithmic score for the gamma distribution."""
     B = backends.active if backend is None else backends[backend]
diff --git a/scoringrules/core/stats.py b/scoringrules/core/stats.py
index 6220ddd..80523a4 100644
--- a/scoringrules/core/stats.py
+++ b/scoringrules/core/stats.py
@@ -55,7 +58,10 @@ def _exp_cdf(x: "ArrayLike", rate: "ArrayLike", backend: "Backend" = None) -> "A
 
 
 def _gamma_pdf(
-    x: "ArrayLike", shape: "ArrayLike", rate: "ArrayLike", backend: "Backend" = None
+    x: "ArrayLike",
+    shape: "ArrayLike",
+    rate: "ArrayLike",
+    backend: "Backend" = None,
 ) -> "Array":
     """Probability density function for the gamma distribution."""
     B = backends.active if backend is None else backends[backend]
@@ -64,7 +67,10 @@ def _gamma_cdf(
-    x: "ArrayLike", shape: "ArrayLike", rate: "ArrayLike", backend: "Backend" = None
+    x: "ArrayLike",
+    shape: "ArrayLike",
+    rate: "ArrayLike",
+    backend: "Backend" = None,
 ) -> "Array":
     """Cumulative distribution function for the gamma distribution."""
     B = backends.active if backend is None else backends[backend]
diff --git a/scoringrules/visualization/reliability.py b/scoringrules/visualization/reliability.py
index b8d0c2b..5dcf2f0 100644
--- a/scoringrules/visualization/reliability.py
+++ b/scoringrules/visualization/reliability.py
@@ -128,7 +128,11 @@ def _uncertainty_band(x, cep, n_bootstrap=100, bandtype="consistency", alpha=0.0
         _x, _y, _cep = corp_reliability(_y, _x)
         res.append(
             interp1d(
-                _x, _cep, fill_value="nan", bounds_error=False, assume_sorted=True
+                _x,
+                _cep,
+                fill_value="nan",
+                bounds_error=False,
+                assume_sorted=True,
             )(x)
         )
     res = np.array(res)
diff --git a/tests/test_interval.py b/tests/test_interval.py
index 79df546..c59a314 100644
--- a/tests/test_interval.py
+++ b/tests/test_interval.py
@@ -7,17 +7,65 @@
 N = 100
 
-## We use Bracher et al (2021) Eq. (3) to test the WIS
-
 
 @pytest.mark.parametrize("backend", BACKENDS)
 def test_interval_score(backend):
+    # basic functionality
+    _ = sr.interval_score(0.1, 0.0, 0.4, 0.5)
+
+    # shapes
+    res = sr.interval_score(
+        obs=np.array([0.1, 0.2, 0.3]),
+        lower=np.array([0.0, 0.1, 0.2]),
+        upper=np.array([0.4, 0.3, 0.5]),
+        alpha=0.5,
+        backend=backend,
+    )
+    assert res.shape == (3,)
+
+    res = sr.interval_score(
+        obs=np.random.uniform(size=(10,)),
+        lower=np.ones((10, 5)) * 0.2,
+        upper=np.ones((10, 5)) * 0.8,
+        alpha=np.linspace(0.1, 0.9, 5),
+        backend=backend,
+    )
+    assert res.shape == (10, 5)
+
+    # raise ValueError
+    with pytest.raises(ValueError):
+        _ = sr.interval_score(
+            obs=np.random.uniform(size=(10,)),
+            lower=np.ones((10, 5)) * 0.2,
+            upper=np.ones((10, 4)) * 0.8,
+            alpha=np.linspace(0.1, 0.9, 5),
+            backend=backend,
+        )
+
+    with pytest.raises(ValueError):
+        _ = sr.interval_score(
+            obs=np.random.uniform(size=(10,)),
+            lower=np.ones((10, 5)) * 0.2,
+            upper=np.ones((10, 5)) * 0.8,
+            alpha=np.linspace(0.1, 0.9, 4),
+            backend=backend,
+        )
+
+    # correctness
+    res = sr.interval_score(0.1, 0.0, 0.4, 0.5, backend=backend)
+    assert np.isclose(res, 0.4)
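+
+    # additional sanity check (not tied to a reference value): an observation
+    # inside every PI scores the interval width 0.8 - 0.2 = 0.6 at all alpha levels
+    res = sr.interval_score(
+        obs=np.full((10,), 0.5),
+        lower=np.ones((10, 5)) * 0.2,
+        upper=np.ones((10, 5)) * 0.8,
+        alpha=np.linspace(0.1, 0.9, 5),
+        backend=backend,
+    )
+    assert np.allclose(np.asarray(res), 0.6)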
+
+
+## We use Bracher et al (2021) Eq. (3) to test the WIS
+@pytest.mark.parametrize("backend", BACKENDS)
+def test_weighted_interval_score(backend):
     obs = np.zeros(N)
     alpha = np.linspace(0.01, 0.99, 99)
     upper = st.norm(0, 1).ppf(np.tile(1 - alpha / 2, (N, 1)))
     lower = st.norm(0, 1).ppf(np.tile(alpha / 2, (N, 1)))
-    WIS = sr.weighted_interval_score(obs, obs, lower, upper, alpha)
-    CRPS = sr.crps_normal(obs, 0, 1)
+    WIS = sr.weighted_interval_score(obs, obs, lower, upper, alpha, backend=backend)
+    CRPS = sr.crps_normal(obs, 0, 1, backend=backend)
+    WIS, CRPS = map(np.asarray, (WIS, CRPS))
 
     assert np.all(1 - WIS / CRPS <= 0.001 * CRPS)
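+
+    # additional check: passing the canonical weights explicitly
+    # (w_median = 0.5, w_alpha = alpha / 2) should reproduce the default result
+    WIS_explicit = sr.weighted_interval_score(
+        obs, obs, lower, upper, alpha, 0.5, alpha / 2, backend=backend
+    )
+    assert np.allclose(np.asarray(WIS_explicit), WIS)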