Skip to content

Commit

Permalink
New SCC revector transformation with sequential kernels and new corre…
Browse files Browse the repository at this point in the history
…sponding pipelines
  • Loading branch information
MichaelSt98 committed Jan 30, 2025
1 parent 48c5cbf commit f3591ec
Show file tree
Hide file tree
Showing 5 changed files with 718 additions and 125 deletions.
2 changes: 1 addition & 1 deletion loki/batch/tests/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2912,7 +2912,7 @@ def test_pipeline_config_compose(config):
assert type(pipeline.transformations[2]).__name__ == 'SCCBaseTransformation'
assert type(pipeline.transformations[3]).__name__ == 'SCCDevectorTransformation'
assert type(pipeline.transformations[4]).__name__ == 'SCCDemoteTransformation'
assert type(pipeline.transformations[5]).__name__ == 'SCCRevectorTransformation'
assert type(pipeline.transformations[5]).__name__ == 'SCCVecRevectorTransformation'
assert type(pipeline.transformations[6]).__name__ == 'SCCAnnotateTransformation'
assert type(pipeline.transformations[7]).__name__ == 'ModuleWrapTransformation'

Expand Down
210 changes: 187 additions & 23 deletions loki/transformations/single_column/scc.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,28 @@
from loki.transformations.single_column.annotate import SCCAnnotateTransformation
from loki.transformations.single_column.hoist import SCCHoistTemporaryArraysTransformation
from loki.transformations.single_column.vector import (
SCCDevectorTransformation, SCCDemoteTransformation, SCCRevectorTransformation
SCCDevectorTransformation, SCCDemoteTransformation, SCCRevectorTransformation,
SCCVecRevectorTransformation, SCCSeqRevectorTransformation
)
from loki.transformations.single_column.vertical import SCCFuseVerticalLoops

__all__ = [
'SCCVectorPipeline', 'SCCHoistPipeline', 'SCCStackPipeline', 'SCCRawStackPipeline'
'SCCVectorPipeline', 'SCCVVectorPipeline', 'SCCSVectorPipeline',
'SCCHoistPipeline', 'SCCVHoistPipeline', 'SCCSHoistPipeline',
'SCCStackPipeline', 'SCCVStackPipeline', 'SCCSStackPipeline',
'SCCRawStackPipeline',
]


# Pipeline factory for the SCC variant with vector-level kernel parallelism.
# `partial` pre-binds the ordered `classes` tuple on `Pipeline`; the entries
# are listed in the order given here, with `SCCVecRevectorTransformation`
# as the revector step and `SCCAnnotateTransformation` last.
SCCVVectorPipeline = partial(
Pipeline, classes=(
SCCFuseVerticalLoops,
SCCBaseTransformation,
SCCDevectorTransformation,
SCCDemoteTransformation,
SCCVecRevectorTransformation,
SCCAnnotateTransformation
)
)
"""
The basic Single Column Coalesced (SCC) transformation with
vector-level kernel parallelism.
Expand All @@ -51,7 +64,7 @@
2. :any:`SCCDevectorTransformation` - Remove horizontal vector loops.
3. :any:`SCCDemoteTransformation` - Demote local temporary array
variables where appropriate.
4. :any:`SCCRevectorTransformation` - Re-insert the vector loops outermost,
4. :any:`SCCVecRevectorTransformation` - Re-insert the vector loops outermost,
according to identified vector sections.
5. :any:`SCCAnnotateTransformation` - Annotate loops according to
programming model (``directive``).
Expand All @@ -73,24 +86,87 @@
demote_local_arrays : bool
Flag to trigger local array demotion to scalar variables where possible
"""
SCCVectorPipeline = partial(

# alias for backwards compatibility
SCCVectorPipeline = SCCVVectorPipeline

SCCSVectorPipeline = partial(
Pipeline, classes=(
SCCFuseVerticalLoops,
SCCBaseTransformation,
SCCDevectorTransformation,
SCCDemoteTransformation,
SCCRevectorTransformation,
SCCSeqRevectorTransformation,
SCCAnnotateTransformation
)
)
"""
The basic Single Column Coalesced (SCC) transformation with
sequential kernels.
This transformation will convert kernels with innermost vectorisation
along a common horizontal dimension to a GPU-friendly loop-layout via
loop inversion and local array variable demotion. The resulting kernel
becomes sequential as the ``horizontal`` loop is hoisted to the driver
and the loop index becomes an argument to the kernel(s).
Moreover, this allows local temporary arrays to be demoted to scalars,
where possible.
The outer "driver" loop over blocks is used as the secondary dimension
of parallelism, where the outer data indexing dimension
(``block_dim``) is resolved in the first call to a "kernel"
routine. This is equivalent to a so-called "gang-vector" parallelisation
scheme.
This :any:`Pipeline` applies the following :any:`Transformation`
classes in sequence:
1. :any:`SCCBaseTransformation` - Ensure utility variables and resolve
problematic code constructs.
2. :any:`SCCDevectorTransformation` - Remove horizontal vector loops.
3. :any:`SCCDemoteTransformation` - Demote local temporary array
variables where appropriate.
4. :any:`SCCSeqRevectorTransformation` - Re-insert the vector loops outermost,
according to identified vector sections.
5. :any:`SCCAnnotateTransformation` - Annotate loops according to
programming model (``directive``).
Parameters
----------
horizontal : :any:`Dimension`
:any:`Dimension` object describing the variable conventions used in code
to define the horizontal data dimension and iteration space.
block_dim : :any:`Dimension`
Optional ``Dimension`` object to define the blocking dimension
to use for hoisted column arrays if hoisting is enabled.
directive : string or None
Directives flavour to use for parallelism annotations; either
``'openacc'`` or ``None``.
trim_vector_sections : bool
Flag to trigger trimming of extracted vector sections to remove
nodes that are not assignments involving vector parallel arrays.
demote_local_arrays : bool
Flag to trigger local array demotion to scalar variables where possible
"""

# Pipeline factory for the hoisting SCC variant with vector-parallel kernels.
# `partial` pre-binds the ordered `classes` tuple on `Pipeline`; callers pass
# the remaining options as keyword arguments (e.g. `horizontal`, `block_dim`,
# `directive`). `HoistTemporaryArraysAnalysis` and
# `SCCHoistTemporaryArraysTransformation` are listed after the revector step
# and before the final `SCCAnnotateTransformation`.
SCCVHoistPipeline = partial(
Pipeline, classes=(
SCCFuseVerticalLoops,
SCCBaseTransformation,
SCCDevectorTransformation,
SCCDemoteTransformation,
SCCVecRevectorTransformation,
HoistTemporaryArraysAnalysis,
SCCHoistTemporaryArraysTransformation,
SCCAnnotateTransformation
)
)
"""
SCC-style transformation that additionally hoists local temporary
SCC-style transformation with "vector-parallel" kernels
that additionally hoists local temporary
arrays that cannot be demoted to the outer driver call.
For details of the kernel and driver-side transformations, please
refer to :any:`SCCVectorPipeline`
refer to :any:`SCCVVectorPipeline`
In addition, this pipeline will invoke
:any:`HoistTemporaryArraysAnalysis` and
Expand Down Expand Up @@ -120,26 +196,77 @@
hoisted. If not provided, no checks will be done for the array
dimensions in :any:`HoistTemporaryArraysAnalysis`.
"""
SCCHoistPipeline = partial(

SCCSHoistPipeline = partial(
Pipeline, classes=(
SCCFuseVerticalLoops,
SCCBaseTransformation,
SCCDevectorTransformation,
SCCDemoteTransformation,
SCCRevectorTransformation,
SCCSeqRevectorTransformation,
HoistTemporaryArraysAnalysis,
SCCHoistTemporaryArraysTransformation,
SCCAnnotateTransformation
)
)
"""
SCC-style transformation with sequential kernels
that additionally hoists local temporary
arrays that cannot be demoted to the outer driver call.
For details of the kernel and driver-side transformations, please
refer to :any:`SCCSVectorPipeline`
In addition, this pipeline will invoke
:any:`HoistTemporaryArraysAnalysis` and
:any:`SCCHoistTemporaryArraysTransformation` before the final
annotation step to hoist multi-dimensional local temporary array
variables to the "driver" routine, where they will be allocated on
device and passed down as arguments.
Parameters
----------
horizontal : :any:`Dimension`
:any:`Dimension` object describing the variable conventions used in code
to define the horizontal data dimension and iteration space.
block_dim : :any:`Dimension`
Optional ``Dimension`` object to define the blocking dimension
to use for hoisted column arrays if hoisting is enabled.
directive : string or None
Directives flavour to use for parallelism annotations; either
``'openacc'`` or ``None``.
trim_vector_sections : bool
Flag to trigger trimming of extracted vector sections to remove
nodes that are not assignments involving vector parallel arrays.
demote_local_arrays : bool
Flag to trigger local array demotion to scalar variables where possible
dim_vars: tuple of str, optional
Variables to be within the dimensions of the arrays to be
hoisted. If not provided, no checks will be done for the array
dimensions in :any:`HoistTemporaryArraysAnalysis`.
"""
SCC-style transformation that additionally pre-allocates a "stack"

# alias for backwards compatibility
SCCHoistPipeline = SCCVHoistPipeline

# Pipeline factory for the pool-allocator ("stack") SCC variant with
# vector-parallel kernels. `partial` pre-binds the ordered `classes` tuple on
# `Pipeline`. Unlike the hoist variant, the extra step
# (`TemporariesPoolAllocatorTransformation`) is listed after
# `SCCAnnotateTransformation`, i.e. as the last entry.
SCCVStackPipeline = partial(
Pipeline, classes=(
SCCFuseVerticalLoops,
SCCBaseTransformation,
SCCDevectorTransformation,
SCCDemoteTransformation,
SCCVecRevectorTransformation,
SCCAnnotateTransformation,
TemporariesPoolAllocatorTransformation
)
)
"""
SCC-style transformation with "vector-parallel" kernels
that additionally pre-allocates a "stack"
pool allocator and associates local arrays with preallocated memory.
For details of the kernel and driver-side transformations, please
refer to :any:`SCCVectorPipeline`
refer to :any:`SCCVVectorPipeline`
In addition, this pipeline will invoke
:any:`TemporariesPoolAllocatorTransformation` to back the remaining
Expand All @@ -166,28 +293,31 @@
Insert bounds-checks in the kernel to make sure the allocated
stack size is not exceeded (default: `True`)
"""
SCCStackPipeline = partial(

# alias for backwards compatibility
SCCStackPipeline = SCCVStackPipeline

SCCSStackPipeline = partial(
Pipeline, classes=(
SCCFuseVerticalLoops,
SCCBaseTransformation,
SCCDevectorTransformation,
SCCDemoteTransformation,
SCCRevectorTransformation,
SCCSeqRevectorTransformation,
SCCAnnotateTransformation,
TemporariesPoolAllocatorTransformation
)
)

"""
SCC-style transformation that additionally pre-allocates a "stack"
pool allocator and replaces local temporaries with indexed sub-arrays
of this preallocated array.
SCC-style transformation with sequential kernels
that additionally pre-allocates a "stack"
pool allocator and associates local arrays with preallocated memory.
For details of the kernel and driver-side transformations, please
refer to :any:`SCCVectorPipeline`
refer to :any:`SCCSVectorPipeline`
In addition, this pipeline will invoke
:any:`TemporariesRawStackTransformation` to back the remaining
:any:`TemporariesPoolAllocatorTransformation` to back the remaining
locally allocated arrays from a "stack" pool allocator that is
pre-allocated in the driver routine and passed down via arguments.
Expand All @@ -210,10 +340,8 @@
check_bounds : bool, optional
Insert bounds-checks in the kernel to make sure the allocated
stack size is not exceeded (default: `True`)
driver_horizontal : str, optional
Override string if a separate variable name should be used for the
horizontal when allocating the stack in the driver.
"""

SCCRawStackPipeline = partial(
Pipeline, classes=(
SCCBaseTransformation,
Expand All @@ -224,3 +352,39 @@
TemporariesRawStackTransformation
)
)
"""
SCC-style transformation that additionally pre-allocates a "stack"
pool allocator and replaces local temporaries with indexed sub-arrays
of this preallocated array.
For details of the kernel and driver-side transformations, please
refer to :any:`SCCVectorPipeline`
In addition, this pipeline will invoke
:any:`TemporariesRawStackTransformation` to back the remaining
locally allocated arrays from a "stack" pool allocator that is
pre-allocated in the driver routine and passed down via arguments.
Parameters
----------
horizontal : :any:`Dimension`
:any:`Dimension` object describing the variable conventions used in code
to define the horizontal data dimension and iteration space.
block_dim : :any:`Dimension`
Optional ``Dimension`` object to define the blocking dimension
to use for hoisted column arrays if hoisting is enabled.
directive : string or None
Directives flavour to use for parallelism annotations; either
``'openacc'`` or ``None``.
trim_vector_sections : bool
Flag to trigger trimming of extracted vector sections to remove
nodes that are not assignments involving vector parallel arrays.
demote_local_arrays : bool
Flag to trigger local array demotion to scalar variables where possible
check_bounds : bool, optional
Insert bounds-checks in the kernel to make sure the allocated
stack size is not exceeded (default: `True`)
driver_horizontal : str, optional
Override string if a separate variable name should be used for the
horizontal when allocating the stack in the driver.
"""
36 changes: 28 additions & 8 deletions loki/transformations/single_column/tests/test_scc_hoist.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
SCCBaseTransformation, SCCDevectorTransformation,
SCCDemoteTransformation, SCCRevectorTransformation,
SCCAnnotateTransformation, SCCHoistPipeline,
SCCVHoistPipeline, SCCSHoistPipeline
)


Expand All @@ -45,7 +46,8 @@ def fixture_blocking():


@pytest.mark.parametrize('frontend', available_frontends())
def test_scc_hoist_multiple_kernels(frontend, horizontal, blocking):
@pytest.mark.parametrize('hoist_pipeline', [SCCVHoistPipeline, SCCSHoistPipeline])
def test_scc_hoist_multiple_kernels(frontend, horizontal, blocking, hoist_pipeline):
"""
Test hoisting of column temporaries to "driver" level.
"""
Expand Down Expand Up @@ -99,7 +101,7 @@ def test_scc_hoist_multiple_kernels(frontend, horizontal, blocking):
driver_item = ProcedureItem(name='#column_driver', source=driver_source)
kernel_item = ProcedureItem(name='#compute_column', source=kernel_source)

scc_hoist = SCCHoistPipeline(
scc_hoist = hoist_pipeline(
horizontal=horizontal, block_dim=blocking, directive='openacc'
)

Expand All @@ -112,11 +114,16 @@ def test_scc_hoist_multiple_kernels(frontend, horizontal, blocking):

# Ensure the expected loops are left in the kernel (one for sequential kernels, two otherwise)
kernel_loops = FindNodes(Loop).visit(kernel.body)
assert len(kernel_loops) == 2
assert kernel_loops[0].variable == 'jl'
assert kernel_loops[0].bounds == 'start:end'
assert kernel_loops[1].variable == 'jk'
assert kernel_loops[1].bounds == '2:nz'
if hoist_pipeline == SCCSHoistPipeline:
assert len(kernel_loops) == 1
assert kernel_loops[0].variable == 'jk'
assert kernel_loops[0].bounds == '2:nz'
else:
assert len(kernel_loops) == 2
assert kernel_loops[0].variable == 'jl'
assert kernel_loops[0].bounds == 'start:end'
assert kernel_loops[1].variable == 'jk'
assert kernel_loops[1].bounds == '2:nz'

# Ensure all expressions and array indices are unchanged
assigns = FindNodes(Assignment).visit(kernel.body)
Expand All @@ -126,7 +133,20 @@ def test_scc_hoist_multiple_kernels(frontend, horizontal, blocking):

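# Ensure the driver has the expected loops: only the block loop for vector kernels, plus horizontal loops around the kernel calls for sequential kernels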
driver_loops = FindNodes(Loop).visit(driver.body)
assert len(driver_loops) == 1
if hoist_pipeline == SCCSHoistPipeline:
assert len(driver_loops) == 3
assert driver_loops[1].variable == 'jl'
assert driver_loops[1].bounds == 'start:end'
calls = FindNodes(CallStatement).visit(driver_loops[1])
assert len(calls) == 1
assert calls[0].name == 'compute_column'
assert driver_loops[2].variable == 'jl'
assert driver_loops[2].bounds == 'start:end'
calls = FindNodes(CallStatement).visit(driver_loops[2])
assert len(calls) == 1
assert calls[0].name == 'compute_column'
else:
assert len(driver_loops) == 1
assert driver_loops[0].variable == 'b'
assert driver_loops[0].bounds == '1:nb'

Expand Down
Loading

0 comments on commit f3591ec

Please sign in to comment.