Merge pull request #16 from Amps-GPU/more_performance
More changes for performance
scarlehoff authored Jul 28, 2024
2 parents 6b1f63a + 8abb2c5 commit f4e36bd
Showing 9 changed files with 187 additions and 107 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

[![CI Lint](https://github.com/Amps-GPU/BG-Trees/actions/workflows/ci_lint.yml/badge.svg)](https://github.com/Amps-GPU/BG-Trees/actions/workflows/ci_lint.yml)
[![CI Test](https://github.com/Amps-GPU/BG-Trees/actions/workflows/ci_test_gpu.yml/badge.svg)](https://github.com/Amps-GPU/BG-Trees/actions/workflows/ci_test_gpu.yml)
[![Coverage](https://img.shields.io/badge/Coverage-76%25-yellow?labelColor=2a2f35)](https://github.com/Amps-GPU/BG-Trees/actions)
[![Coverage](https://img.shields.io/badge/Coverage-74%25-yellow?labelColor=2a2f35)](https://github.com/Amps-GPU/BG-Trees/actions)

## Installation

32 changes: 24 additions & 8 deletions benchmarks/timing_benchmark.py
@@ -20,7 +20,8 @@
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument(
"data_file", help="A npz data file with the right information (use `create_array` to run this benchmark)"
"data_file",
help="A npz data file with the right information (use `create_array` to run this benchmark)",
)
parser.add_argument(
"-n",
@@ -30,8 +31,11 @@
type=int,
default=[10, 100, 1000],
)
parser.add_argument("-a", "--average", help="Run <average> times and take the average", type=int, default=1)
parser.add_argument(
"-a", "--average", help="Run <average> times and take the average", type=int, default=1
)
parser.add_argument("-o", "--output", help="Output file for results as <n> <events>", type=str)
parser.add_argument("--profile", action="store_true")
args = parser.parse_args()

load_info = np.load(args.data_file)
@@ -57,11 +61,20 @@

res_per_n = {}

for nev in list_of_n:
# Run a bit just to activate the JIT compilation
if not settings.executing_eagerly():
_ = another_j(ff_moms[:10, 1:], ff_pols[:10, 1:], put_propagator=False, verbose=False)

if args.profile:
import tensorflow as tf

# Run a bit just to activate the JIT compilation
if not settings.executing_eagerly():
_ = another_j(ff_moms[:10, 1:], ff_pols[:10, 1:], put_propagator=False, verbose=False)
logdir_path = "profiling_here"
options = tf.profiler.experimental.ProfilerOptions(
host_tracer_level=3, python_tracer_level=1, device_tracer_level=1
)
tf.profiler.experimental.start(logdir_path, options=options)

for nev in list_of_n:

timing_raw = 0

@@ -86,8 +99,11 @@

print(f"n = {nev} took {timing:.5}s")

# finres = np.concatenate([i.values.numpy() for i in total_final_results])
# np.testing.assert_allclose(finres, load_info["target"][:nev])
# finres = np.concatenate([i.values.numpy() for i in total_final_results])
# np.testing.assert_allclose(finres, load_info["target"][:nev])

if args.profile:
tf.profiler.experimental.stop()

if args.output is not None:
res_as_str = "\n".join([f"{i} {j}" for i, j in res_per_n.items()])
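
The new --profile flag brackets the timing loop with TensorFlow's experimental profiler. For reference, a minimal, self-contained sketch of that start/stop pattern (the traced function and log directory here are illustrative, not repository code):

    import tensorflow as tf

    @tf.function
    def work(x):
        return tf.reduce_sum(x * x)

    # Same options the benchmark passes: trace host, Python and device activity.
    options = tf.profiler.experimental.ProfilerOptions(
        host_tracer_level=3, python_tracer_level=1, device_tracer_level=1
    )
    tf.profiler.experimental.start("profiling_here", options=options)
    for _ in range(10):
        work(tf.random.uniform((1000,)))
    tf.profiler.experimental.stop()

    # The resulting trace can then be inspected with TensorBoard, e.g.
    #   tensorboard --logdir profiling_here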
7 changes: 4 additions & 3 deletions bgtrees/currents.py
@@ -6,7 +6,7 @@
from bgtrees.finite_gpufields import operations as op
from bgtrees.finite_gpufields.finite_fields_tf import FiniteField

from .metric_and_verticies import V3g, V4g, η
from .metric_and_verticies import V3g, V4g, new_V3g, η
from .settings import settings


@@ -77,7 +77,7 @@ def _vert_3_gluon(slice_left, slice_right):
"""Computes the 3g vertex for the two slices coming in."""
moms_sl = tf.reduce_sum(slice_left, axis=1)
moms_sr = tf.reduce_sum(slice_right, axis=1)
return V3g(moms_sl, moms_sr, einsum=op.ff_tensor_product)
return new_V3g(moms_sl, moms_sr)


@tf.function(reduce_retracing=True)
@@ -107,7 +107,8 @@ def _contract_v4_current(vertex, jnu, jo, jrho):

tmp_1 = op.ff_dot_product_single_batch(jrho, v4, rank_x=2, rank_y=2) # rp, pN -> rN
tmp_1 = tmp_1.reshape_ff((-1, D, D, D)) # rN -> ronm
tmp_1 = op.ff_index_permutation("ronm->rmno", tmp_1)
tmp_1 = tmp_1.transpose_ff((0, 3, 2, 1))
# tmp_1 = op.ff_index_permutation("ronm->rmno", tmp_1)

tmp_2 = op.ff_dot_product_tris(tmp_1, jo, rank_x=4, rank_y=2) # rmno, ro -> rmn
return op.ff_dot_product(tmp_2, jnu, rank_x=3, rank_y=2) # rmn, rn -> rm
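
Replacing ff_index_permutation with transpose_ff swaps an einsum-string permutation for a plain tf.transpose. A small sanity check (illustrative, on ordinary integer tensors rather than FiniteField containers) that perm=(0, 3, 2, 1) reproduces "ronm->rmno":

    import numpy as np
    import tensorflow as tf

    # Shape (r, o, n, m); the values are arbitrary.
    x = tf.constant(np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5))
    via_einsum = tf.einsum("ronm->rmno", x)
    via_transpose = tf.transpose(x, perm=(0, 3, 2, 1))
    assert np.array_equal(via_einsum.numpy(), via_transpose.numpy())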
132 changes: 53 additions & 79 deletions bgtrees/finite_gpufields/finite_fields_tf.py
@@ -139,6 +139,10 @@ def reshape_ff(self, new_shape):
new_values = tf.reshape(self.n, new_shape)
return self.__class__(new_values, self.p)

def transpose_ff(self, permutation):
new_values = tf.transpose(self.n, perm=permutation)
return self.__class__(new_values, self.p)

# Mirror version
def __radd__(self, b):
return self + b
@@ -218,99 +222,69 @@ def finite_field_squeeze(input, axis, name=None):
return FiniteField(new_n, input.p)


######
def _finite_field_reduce(red_op, input_tensor, axis=None, keepdims=False):
"""Auxiliar function to reduce Finite Field containers"""
if axis is None:
all_axes = list(range(len(input_tensor.shape)))
return _finite_field_reduce(red_op, input_tensor, axis=all_axes, keepdims=keepdims)

# Hopefully the exception block is compiled away upon first pass...
try:
res = input_tensor
for ax in axis:
res = _finite_field_reduce(red_op, res, axis=ax, keepdims=True)
if not keepdims:
res = tf.squeeze(res, axis=axis)
return res
except TypeError:
pass

@tf.py_function(Tout=settings.dtype)
def reduce_me(itensor):
# First separate the FF in the axis that we want to sum over
unstacked_ff = tf.unstack(itensor, axis=axis)
summed_ff = functools.reduce(red_op, unstacked_ff)
# Add a dummy dimension if the user asked for it
if keepdims:
summed_ff = tf.expand_dims(summed_ff, axis)
return summed_ff.n

return FiniteField(reduce_me(input_tensor), input_tensor.p)


@experimental.dispatch_for_api(tf.reduce_sum, {"input_tensor": FiniteField})
def finite_field_reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
"""Override the reduce_sum operation for a FiniteField container"""
return _finite_field_reduce(operator.add, input_tensor, axis=axis, keepdims=keepdims)
def _ff_reduce_internal(accumulated, next_element, operation=operator.add, p=settings.tf_p):
"""Apply the operation ``operation`` inmediately taking the %
Acts on integers
"""
return tf.math.floormod(operation(accumulated, next_element), p)


@experimental.dispatch_for_api(tf.reduce_prod, {"input_tensor": FiniteField})
def finite_field_reduce_prod(input_tensor, axis=None, keepdims=False, name=None):
"""Override the reduce_sum operation for a FiniteField container"""
return _finite_field_reduce(operator.mul, input_tensor, axis=axis, keepdims=keepdims)

@functools.lru_cache
def _dispatch_ff_reduction(operation, p):
"""Dispatchs the _ff_reduce_internal with the right settings"""
return tf.function(functools.partial(_ff_reduce_internal, operation=operation, p=p))

def oinsum(eq, *arrays):
"""A ``einsum`` implementation for ``numpy`` object arrays."""
lhs, output = eq.split("->")
inputs = lhs.split(",")

sizes = {}
for term, array in zip(inputs, arrays):
for k, d in zip(term, array.shape):
sizes[k] = d
@tf.function
def _ff_reduce_single_axis(input_tensor, axis, operation=operator.add, p=settings.p):
"""Utilize the numpy experimental API and foldl to apply a reduction to the input tensor.
Acts on integer, the calling function should then make it into a FF.
"""
reduce_fn = _dispatch_ff_reduction(operation, p)
operate_on = tf.experimental.numpy.moveaxis(input_tensor, axis, 0)
return tf.foldl(reduce_fn, operate_on)

out_size = tuple(sizes[k] for k in output)
out = np.empty(out_size, dtype=object)

inner = [k for k in sizes if k not in output]
inner_size = [sizes[k] for k in inner]
@tf.function
def _ff_reduce(input_tensor, axis, keepdims, operation):
raw_input = input_tensor.n

for coo_o in np.ndindex(*out_size):
coord = dict(zip(output, coo_o))
if axis is None:
ret = raw_input
for _ in range(raw_input.ndim):
ret = _ff_reduce_single_axis(ret, 0, operation=operation)

def gen_inner_sum():
for coo_i in np.ndindex(*inner_size):
coord.update(dict(zip(inner, coo_i)))
if keepdims:
for _ in range(raw_input.ndim):
ret = tf.expand_dims(ret, 0)

locs = []
for term in inputs:
locs.append(tuple(coord[k] for k in term))
elif isinstance(axis, int):
ret = _ff_reduce_single_axis(raw_input, axis, operation=operation)

elements = []
for array, loc in zip(arrays, locs):
elements.append(array[loc])
if keepdims:
ret = tf.expand_dims(ret, axis)

yield functools.reduce(operator.mul, elements)
else:
# A tuple has been received
ret = input_tensor
for ax in axis:
ret = _ff_reduce(ret, ax, True, operation)
ret = ret.n

tmp = functools.reduce(operator.add, gen_inner_sum())
out[coo_o] = tmp
# Now, if keepdims was False, remove the dimensions
if not keepdims:
ret = tf.squeeze(ret, axis)

# if the output is made of finite fields, take them out
if isinstance(tmp, FiniteField) and len(out_size) == 0:
out = tmp
elif isinstance(tmp, FiniteField):
p = tmp.p
return FiniteField(ret, input_tensor.p)

def unff(x):
if isinstance(x, FiniteField):
return x.n.numpy()
return x

vunff = np.vectorize(unff)
@experimental.dispatch_for_api(tf.reduce_sum, {"input_tensor": FiniteField})
def finite_field_reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
"""Override the reduce_sum operation for a FiniteField container"""
return _ff_reduce(input_tensor, axis, keepdims, operation=operator.add)

new_out = vunff(out)
out = FiniteField(new_out, p)

return out
@experimental.dispatch_for_api(tf.reduce_prod, {"input_tensor": FiniteField})
def finite_field_reduce_prod(input_tensor, axis=None, keepdims=False, name=None):
"""Override the reduce_sum operation for a FiniteField container"""
return _ff_reduce(input_tensor, axis, keepdims, operation=operator.mul)
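
The py_function-based reduction above is replaced by an in-graph tf.foldl that takes the modulus at every step, so intermediates never exceed roughly p² and stay within int64 for p just below 2³¹. A minimal sketch of the same pattern on plain int64 tensors (prime and values chosen for illustration):

    import operator

    import tensorflow as tf

    p = 2**31 - 19

    def reduce_mod_p(accumulated, next_element, operation=operator.add):
        # Fold step: combine and immediately reduce mod p.
        return tf.math.floormod(operation(accumulated, next_element), p)

    x = tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.int64)
    # Reduce along axis 1: move it to the front, then fold pairwise.
    moved = tf.experimental.numpy.moveaxis(x, 1, 0)
    row_sums = tf.foldl(reduce_mod_p, moved)  # -> [6, 15] (mod p)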
13 changes: 1 addition & 12 deletions bgtrees/finite_gpufields/operations.py
@@ -22,22 +22,11 @@ def ff_einsum_generic(einstr, *args):
ff_tensor_product
ff_index_permutation
"""
if len(args) == 1:
return ff_index_permutation(einstr, *args)
elif len(args) == 2:
if len(args) == 2:
return ff_tensor_product(einstr, *args)
raise NotImplementedError(f"Automatic understanding of contractions not implemented for {einstr}")


@tf.function(reduce_retracing=True)
def ff_index_permutation(einstr, x):
"""Uses tf.einsum to permute the index of the tensor x
Since this is simply an index permutation, it goes transparently to tf.einsum
"""
ret = tf.einsum(einstr, x.values)
return FiniteField(ret, x.p)


@tf.function(reduce_retracing=True)
def ff_dot_product(x, y, rank_x=None, rank_y=None):
"""Perform a dot product between two batched Finite Fields
26 changes: 26 additions & 0 deletions bgtrees/metric_and_verticies.py
@@ -2,13 +2,19 @@
import numpy
import tensorflow as tf

from .finite_gpufields.finite_fields_tf import FiniteField
from .settings import settings
from .tools import gpu_constant, gpu_function

Gamma = γμ = numpy.block([[numpy.zeros((4, 2, 2)), Pauli_bar], [Pauli, numpy.zeros((4, 2, 2))]])
Gamma5 = γ5 = numpy.block([[numpy.identity(2), numpy.zeros((2, 2))], [numpy.zeros((2, 2)), -numpy.identity(2)]])


@gpu_constant
def diag_mink(D):
return numpy.array([1] + [-1] * (D - 1)).astype(settings.dtype)


@gpu_constant
def MinkowskiMetric(D):
"""D-dimensional Minkowski metric in the mostly negative convention."""
@@ -39,3 +45,23 @@ def V3g(lp1, lp2, einsum=numpy.einsum):
return (
einsum("mn,rl->rlmn", mm, (lp1 - lp2)) + 2 * einsum("nl,rm->rlmn", mm, lp2) - 2 * einsum("lm,rn->rlmn", mm, lp1)
)


@tf.function(reduce_retracing=True)
def new_V3g(lp1, lp2):
"""3-gluon vertex, upper indices μνρ, D-dimensional. Reduce tensor products."""
D = lp1.shape[1]
if D is None:
D = settings.D

mm = diag_mink(D)
r1 = tf.tensordot(lp1.n, mm, 0)
r2 = tf.tensordot(lp2.n, mm, 0)

r1 = tf.linalg.diag(r1)
r2 = tf.linalg.diag(r2)

a1 = FiniteField(r1, lp1.p)
a2 = FiniteField(r2, lp1.p)

return (a1 - a2) + 2.0 * a2.transpose_ff((0, 3, 1, 2)) - 2.0 * a1.transpose_ff((0, 2, 3, 1))
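
Because the Minkowski metric is diagonal, each einsum term of the original V3g can be assembled from an outer product (tf.tensordot with axes=0), a diagonal embedding (tf.linalg.diag) and an index transposition. A hedged check of this construction for the first term, on plain integer tensors with made-up momenta:

    import numpy as np
    import tensorflow as tf

    D, r = 4, 2
    eta = np.diag([1] + [-1] * (D - 1)).astype(np.int64)   # full metric
    mm = np.array([1] + [-1] * (D - 1)).astype(np.int64)   # its diagonal
    lp = np.arange(r * D, dtype=np.int64).reshape(r, D)    # fake momenta

    term_einsum = np.einsum("mn,rl->rlmn", eta, lp)
    term_diag = tf.linalg.diag(tf.tensordot(lp, mm, 0)).numpy()
    assert np.array_equal(term_einsum, term_diag)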
5 changes: 5 additions & 0 deletions bgtrees/settings.py
@@ -9,6 +9,7 @@ class Settings:
use_gpu: bool = False
D: int = 4
dtype: type = np.int64
p: int = 2**31 - 19

# Tensorflow settings
def run_tf_eagerly(self):
@@ -17,6 +18,10 @@ def run_tf_eagerly(self):
def executing_eagerly(self):
return tf.executing_eagerly()

@property
def tf_p(self):
return tf.cast(self.p, dtype=self.dtype)


settings = Settings()
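
The new default prime sits just below 2³¹ so that a single mod-p product never overflows the int64 dtype used throughout. A quick illustration (not repository code):

    import numpy as np

    p = 2**31 - 19
    # The largest intermediate a mod-p multiplication can produce is (p - 1)**2,
    # which must still fit in a signed 64-bit integer.
    assert (p - 1) ** 2 < np.iinfo(np.int64).max  # ~4.6e18 < ~9.2e18

    a, b = 123_456_789, 987_654_321
    product_mod_p = (a * b) % p  # safe: a * b < p**2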
