diff --git a/.github/workflows/run_notebooks.yaml b/.github/workflows/run_notebooks.yaml
index 01d8c30..0291052 100644
--- a/.github/workflows/run_notebooks.yaml
+++ b/.github/workflows/run_notebooks.yaml
@@ -15,46 +15,51 @@ jobs:
   notebooks:
     name: Running Docs Notebooks
     runs-on: ubuntu-latest
-    defaults:
-      run:
-        # Adding -l {0} ensures conda can be found properly in each step
-        shell: bash -l {0}
+    # defaults:
+    #   run:
+    #     # Adding -l {0} ensures conda can be found properly in each step
+    #     shell: bash -l {0}
     steps:
       - uses: actions/checkout@main
        with:
           fetch-depth: 1
-
-      - name: Cache conda
-        uses: actions/cache@v2
-        env:
-          # Increase this value to reset cache if ci/test-env.yml has not changed
-          CACHE_NUMBER: 0
+      - uses: mpi4py/setup-mpi@v1
+      - name: Setup Python
+        uses: actions/setup-python@v4
         with:
-          path: ~/conda_pkgs_dir
-          key:
-            ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ matrix.python-version }}-${{ hashFiles('ci/test-env.yml', 'setup.cfg') }}
+          python-version: ${{ matrix.python-version }}
 
-      - name: Setup Miniconda
-        uses: conda-incubator/setup-miniconda@v2.1.1
-        with:
-          # auto-update-conda: true
-          miniconda-version: "latest"
-          python-version: '3.10'
-          environment-file: ci/notebook-env.yml
-          activate-environment: viscpu
-          channels: conda-forge,defaults
-          channel-priority: strict
-          use-only-tar-bz2: true
+      # - name: Cache conda
+      #   uses: actions/cache@v2
+      #   env:
+      #     # Increase this value to reset cache if ci/test-env.yml has not changed
+      #     CACHE_NUMBER: 0
+      #   with:
+      #     path: ~/conda_pkgs_dir
+      #     key:
+      #       ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ matrix.python-version }}-${{ hashFiles('ci/test-env.yml', 'setup.cfg') }}
 
-      - name: Conda Info
-        run: |
-          conda info -a
-          conda list
+      # - name: Setup Miniconda
+      #   uses: conda-incubator/setup-miniconda@v2.1.1
+      #   with:
+      #     # auto-update-conda: true
+      #     miniconda-version: "latest"
+      #     python-version: '3.10'
+      #     environment-file: ci/notebook-env.yml
+      #     activate-environment: viscpu
+      #     channels: conda-forge,defaults
+      #     channel-priority: strict
+      #     use-only-tar-bz2: true
+
+      # - name: Conda Info
+      #   run: |
+      #     conda info -a
+      #     conda list
 
       - name: Install
         run: |
           echo $(which pip)
-          pip install .[test]
+          pip install .[test] papermill jupyter ipykernel
 
       - name: Install ipykernel
         run: python -m ipykernel install --user --name viscpu --display-name "viscpu"
diff --git a/.github/workflows/test_suite.yaml b/.github/workflows/test_suite.yaml
index 931461d..01839bc 100644
--- a/.github/workflows/test_suite.yaml
+++ b/.github/workflows/test_suite.yaml
@@ -12,63 +12,88 @@ on:
       - 'main'
 
 jobs:
-  tests:
+  cpu_tests:
     env:
       ENV_NAME: tests
       PYTHON: ${{ matrix.python-version }}
       OS: ${{ matrix.os }}
+      LOG_LEVEL: ${{ (matrix.os == 'macos-latest' && 'WARNING') || 'INFO' }} # Suppress logging on macOS
     name: Testing
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest]
-        python-version: [3.8, 3.9, "3.10"]
-    defaults:
-      run:
-        # Adding -l {0} ensures conda can be found properly in each step
-        shell: bash -l {0}
+        python-version: [3.9, "3.10", "3.11"]
     steps:
       - uses: actions/checkout@main
         with:
           fetch-depth: 1
+      - uses: mpi4py/setup-mpi@v1
+      - uses: FedericoCarboni/setup-ffmpeg@v2
 
-      - name: Cache conda
-        uses: actions/cache@v2
-        env:
-          # Increase this value to reset cache if ci/test-env.yml has not changed
-          CACHE_NUMBER: 0
+      - name: Setup Python
+        uses: actions/setup-python@v4
         with:
-          path: ~/conda_pkgs_dir
-          key:
-            ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ matrix.python-version }}-${{ hashFiles('ci/test-env.yml', 'setup.cfg') }}
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install
+        run: |
+          pip install .[test]
+
+      - name: Run Tests
+        run: |
+          python -m pytest --ignore tests/test_plot.py --cov=vis_cpu --cov-config=.coveragerc --cov-report xml:./coverage.xml --durations=25 --log-cli-level=${{ env.LOG_LEVEL}}
+
+      - name: Run Plotting Tests
+        if: matrix.python-version != '3.9' || matrix.os == 'macos-latest'
+        run: |
+          python -m pytest tests/test_plot.py --cov=vis_cpu --cov-config=.coveragerc --cov-append --cov-report xml:./coverage.xml --durations=25 --log-cli-level=INFO
 
-      - name: Setup Miniconda
-        uses: conda-incubator/setup-miniconda@v2.1.1
+      - name: Upload coverage report
+        uses: codecov/codecov-action@v3.1.3
         with:
-          # auto-update-conda: true
-          miniconda-version: "latest"
-          python-version: ${{ matrix.python-version }}
-          environment-file: ci/test-env.yml
-          activate-environment: tests
-          channels: conda-forge,defaults
-          channel-priority: strict
-          use-only-tar-bz2: true
+          file: ./coverage.xml
+          flags: unittests
+          name: codecov-umbrella
+          fail_ci_if_error: true
 
-      - name: Conda Info
+  gpu_tests:
+    env:
+      ENV_NAME: tests
+      PYTHON: "3.10"
+      OS: Ubuntu
+    name: Self-Hosted Tests (GPU)
+    runs-on: [self-hosted, gpu]
+
+    steps:
+      - name: Add Home to PATH
         run: |
-          conda info -a
-          conda list
-          PYVER=`python -c "import sys; print('{:d}.{:d}'.format(sys.version_info.major, sys.version_info.minor))"`
-          if [[ $PYVER != $PYTHON ]]; then
-            exit 1;
-          fi
+          echo "/home/locoadmin/bin" >> $GITHUB_PATH
+          echo "/usr/local/cuda/bin" >> $GITHUB_PATH
+
+      - uses: actions/checkout@main
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON }}
 
       - name: Install
         run: |
           echo $(which pip)
-          pip install .[test]
+          pip install .[test,gpu]
 
       - name: Run Tests
         run: |
-          python -m pytest --cov=vis_cpu --cov-config=.coveragerc --cov-report xml:./coverage.xml --durations=25
+          python -m pytest -k "gpu" --cov=vis_cpu --cov-config=.coveragerc --cov-report xml:./coverage.xml --durations=25 --log-cli-level=INFO
+
+      - name: Upload coverage report
+        uses: codecov/codecov-action@v3.1.3
+        with:
+          file: ./coverage.xml
+          flags: unittests
+          name: codecov-umbrella
+          fail_ci_if_error: true
diff --git a/.github/workflows/test_suite_gpu.yaml b/.github/workflows/test_suite_gpu.yaml
deleted file mode 100644
index f21f221..0000000
--- a/.github/workflows/test_suite_gpu.yaml
+++ /dev/null
@@ -1,82 +0,0 @@
-name: GPU Tests
-
-# Test on all pushes, except when the push is literally just a tag (because we
-# tag automatically via CI, and therefore there's no extra code in that push).
-# Also, only test on pull requests into master/dev.
-on:
-  push:
-    tags-ignore:
-      - 'v*'
-  pull_request:
-    branches:
-      - 'main'
-
-jobs:
-  tests:
-    env:
-      ENV_NAME: tests
-      PYTHON: "3.10"
-      OS: Ubuntu
-    name: Self-Hosted Tests (GPU)
-    runs-on: [self-hosted, gpu]
-    defaults:
-      run:
-        # Adding -l {0} ensures conda can be found properly in each step
-        shell: bash -l {0}
-    steps:
-      - name: Add Home to PATH
-        run: |
-          echo "/home/locoadmin/bin" >> $GITHUB_PATH
-          echo "/usr/local/cuda/bin" >> $GITHUB_PATH
-          echo "LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV
-
-      - uses: actions/checkout@main
-        with:
-          fetch-depth: 0
-
-      - name: Cache conda
-        uses: actions/cache@v2
-        env:
-          # Increase this value to reset cache if ci/test-env.yml has not changed
-          CACHE_NUMBER: 0
-        with:
-          path: ~/conda_pkgs_dir
-          key:
-            ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ env.PYTHON }}-${{ hashFiles('ci/test-env.yml', 'setup.cfg') }}
-
-      - name: Setup Miniconda
-        uses: conda-incubator/setup-miniconda@v2.1.1
-        with:
-          # auto-update-conda: true
-          miniconda-version: "latest"
-          python-version: ${{ env.PYTHON }}
-          environment-file: ci/test-env.yml
-          activate-environment: tests
-          channels: conda-forge,defaults
-          channel-priority: strict
-          use-only-tar-bz2: true
-
-      - name: Conda Info
-        run: |
-          conda info -a
-          conda list
-          PYVER=`python -c "import sys; print('{:d}.{:d}'.format(sys.version_info.major, sys.version_info.minor))"`
-          if [[ $PYVER != $PYTHON ]]; then
-            exit 1;
-          fi
-
-      - name: Install
-        run: |
-          echo $(which pip)
-          pip install .[test,gpu]
-
-      - name: Run Tests
-        run: |
-          python -m pytest --cov=vis_cpu --cov-config=.coveragerc --cov-report xml:./coverage.xml --durations=25 --log-cli-level=DEBUG
-
-      - uses: codecov/codecov-action@v2
-        if: success()
-        with:
-          fail_ci_if_error: true
-          verbose: true
-          file: ./coverage.xml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b1b1ac2..0846420 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@ exclude: '^docs/conf.py|^src/vis_cpu/data/'
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
+    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: check-added-large-files
@@ -18,7 +18,7 @@ repos:
        args: ['--fix=no']
 
   - repo: https://github.com/PyCQA/flake8
-    rev: 4.0.1
+    rev: 6.0.0
    hooks:
      - id: flake8
        additional_dependencies:
@@ -33,27 +33,27 @@ repos:
          - flake8-print
 
   - repo: https://github.com/psf/black
-    rev: 22.6.0
+    rev: 23.3.0
    hooks:
      - id: black
 
   - repo: https://github.com/pre-commit/pygrep-hooks
-    rev: v1.9.0
+    rev: v1.10.0
    hooks:
      - id: rst-backticks
 
   - repo: https://github.com/PyCQA/isort
-    rev: 5.10.1
+    rev: 5.12.0
    hooks:
      - id: isort
 
   - repo: https://github.com/asottile/pyupgrade
-    rev: v2.37.1
+    rev: v3.3.2
    hooks:
      - id: pyupgrade
-        args: [--py38-plus]
+        args: [--py39-plus]
 
   - repo: https://github.com/asottile/setup-cfg-fmt
-    rev: v1.20.2
+    rev: v2.2.0
    hooks:
      - id: setup-cfg-fmt
diff --git a/.readthedocs.yml b/.readthedocs.yml
index cd75f7a..0f3e92e 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -9,12 +9,16 @@ version: 2
 sphinx:
   configuration: docs/conf.py
 
-conda:
-  environment: docs/environment.yaml
+# conda:
+#   environment: docs/environment.yaml
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
 
 python:
-  version: 3.7
   install:
     - method: pip
      path: .
-      system_packages: true
+      extra_requirements:
+        - docs
diff --git a/ci/test-env.yml b/ci/test-env.yml
index 473e614..26ee820 100644
--- a/ci/test-env.yml
+++ b/ci/test-env.yml
@@ -3,18 +3,18 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - mpi4py>=3.0,<4 # Required here so that we get
-  - astropy>=4,<5
-  - numpy>=1.20,<2.0
-  - scipy>=1.6,<2.0
-  - pip>=21.1,<22
+  - mpi4py>=3.0
+  - astropy>=4
+  - numpy>=1.20
+  - scipy>=1.6
+  - pip>=21.1
   - pytest>=6.2.4
   - pytest-cov>=2.11.1
-  - matplotlib>=3.3.4,<4
-  - ipython>=7.22,<8
-  - h5py>=3.2,<4
+  - matplotlib>=3.3.4
+  - ipython>=7.22
+  - h5py>=3.2
   - ffmpeg
   - pyuvdata # For testing when using UVBeam object
   - pip:
-      - pyuvsim[sim]>=1.2,<1.4
-      - pyradiosky>=0.1.1,<0.3
+      - pyuvsim[sim]>=1.2
+      - pyradiosky>=0.1.1
diff --git a/codecov.yml b/codecov.yml
index a864649..cd09ab7 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,2 +1,4 @@
 fixes:
   - "/home/runner/work/vis_cpu/::"
+codecov:
+  token: f1912835-f0e2-4153-891b-f68eb359207f
diff --git a/docs/environment.yaml b/docs/environment.yaml
index 8747144..3c4cf34 100644
--- a/docs/environment.yaml
+++ b/docs/environment.yaml
@@ -7,6 +7,6 @@ dependencies:
   - ipython
   - nbsphinx
   - numpydoc
-  - sphinx>=4.0
+  - sphinx>=5.0
   - pip:
      - furo
diff --git a/docs/templates/custom-module.rst b/docs/templates/custom-module.rst
index 49e7134..a848390 100644
--- a/docs/templates/custom-module.rst
+++ b/docs/templates/custom-module.rst
@@ -2,6 +2,7 @@
 
 .. automodule:: {{ fullname }}
    :exclude-members: profile
+
    {% block attributes %}
    {% if attributes %}
    .. rubric:: {{ _('Module Attributes') }}
diff --git a/setup.cfg b/setup.cfg
index cf94ee5..9ce731e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -13,15 +13,18 @@ classifiers =
     Development Status :: 4 - Beta
     License :: OSI Approved :: MIT License
     Programming Language :: Python
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3 :: Only
 
 [options]
 packages = find_namespace:
 install_requires =
     astropy
     numpy
+    psutil
     pyuvdata>=2.2.8
     scipy
-    typing-extensions;python_version<'3.8'
+python_requires = >=3.9
 include_package_data = True
 package_dir =
     =src
@@ -37,11 +40,16 @@ console_scripts =
     viscpu = vis_cpu.cli:main
 
 [options.extras_require]
+all =
+    vis-cpu[gpu,profile,dev]
+dev =
+    vis-cpu[docs,test]
 docs =
+    furo
+    ipython
    nbsphinx
    numpydoc
    sphinx
-    sphinx-rtd-theme
 gpu =
    jinja2
    pycuda
@@ -49,14 +57,14 @@ gpu =
 profile =
    click
    line-profiler
-    pyuvsim
+    pyuvsim>=1.2.5
 test =
    ipython
    matplotlib
    pyradiosky
    pytest
    pytest-cov
-    pyuvsim[sim]
+    pyuvsim[sim]>=1.2.5
 
 [test]
 extras = True
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 87fecd8..0000000
--- a/setup.py
+++ /dev/null
@@ -1,4 +0,0 @@
-"""Setup the package."""
-from setuptools import setup
-
-setup()
diff --git a/src/vis_cpu/_utils.py b/src/vis_cpu/_utils.py
index 4a7fe17..1398e93 100644
--- a/src/vis_cpu/_utils.py
+++ b/src/vis_cpu/_utils.py
@@ -1,3 +1,23 @@
+import logging
+
+
 def no_op(fnc):
     """No-op function."""
     return fnc
+
+
+def human_readable_size(size, decimal_places=2, indicate_sign=False):
+    """Get a human-readable data size.
+
+    From: https://stackoverflow.com/a/43690506/1467820
+    """
+    for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
+        if abs(size) < 1024.0:
+            break
+        if unit != "PiB":
+            size /= 1024.0
+
+    if indicate_sign:
+        return f"{size:+.{decimal_places}f} {unit}"
+    else:
+        return f"{size:.{decimal_places}f} {unit}"
diff --git a/src/vis_cpu/_uvbeam_to_raw.py b/src/vis_cpu/_uvbeam_to_raw.py
index 1eb8e2a..44adf02 100644
--- a/src/vis_cpu/_uvbeam_to_raw.py
+++ b/src/vis_cpu/_uvbeam_to_raw.py
@@ -127,6 +127,9 @@ def uvbeam_to_azza_grid(
             "The beam data does not cover the full sky. Cannot use in vis_cpu."
         )
 
+    if not uvbeam.future_array_shapes:
+        uvbeam.use_future_array_shapes()
+
     # Simplest Case: everything is already in the regular format we need.
     if (
         naz == len(az)
@@ -135,14 +138,14 @@ def uvbeam_to_azza_grid(
         and covers_sky_strong
     ):
         # Returned data has shape (Nax, Nfeeds, Nza, Naz)
-        return uvbeam.data_array[:, 0, :, 0], delta_az, dza
+        return uvbeam.data_array[:, :, 0], delta_az, dza
     elif (
         naz == len(az)
         and np.isclose(dza, delta_za)
         and is_regular_grid
         and covers_sky_almost_strong
     ):
-        data = uvbeam.data_array[:, 0, :, 0]
+        data = uvbeam.data_array[:, :, 0]
         data = np.concatenate((data, data[..., [0]]), axis=-1)
         return data, delta_az, dza
     else:
@@ -150,7 +153,8 @@ def uvbeam_to_azza_grid(
             "The raw beam data is either irregular, or does not have the spacing you "
             "desire. This means we need to interpolate to a grid, from which a second "
             "round of interpolation will be performed in the visibility calculation."
-            "You might be able to avoid this by not specifying a desired naz and dza."
+            " You might be able to avoid this by not specifying a desired naz and dza.",
+            stacklevel=1,
         )
 
     # We have to treat az and za differently. For az, we need to start at 0 and end at 2pi exactly.
@@ -169,11 +173,11 @@ def uvbeam_to_azza_grid(
 
     # Returned data has shape (Nax, Nfeeds, Nza, Naz)
     if uvbeam.beam_type == "efield":
-        return out[:, 0, :, 0], new_az[1] - new_az[0], dza
+        return out[:, :, 0], new_az[1] - new_az[0], dza
     else:
         # For a power beam, we just want to the I part of the XX pol.
         # Also, need the sqrt of the beam (to make it quasi X)
-        out = out[0, 0, 0, 0]
+        out = out[0, 0, 0]
 
         # But we need to return it with the nax and nfeed dimensions (both have size 1)
         return out[np.newaxis, np.newaxis], new_az[1] - new_az[0], dza
diff --git a/src/vis_cpu/cli.py b/src/vis_cpu/cli.py
index 378b156..2d8b39e 100644
--- a/src/vis_cpu/cli.py
+++ b/src/vis_cpu/cli.py
@@ -35,7 +35,7 @@
     "eq2top": ("np.dot(eq2top",),
     "beam_interp": ("_evaluate_beam_cpu(",),
     "get_tau": ("np.dot(antpos",),
-    "get_antenna_vis": ("v = get_antenna_vis(",),
+    "get_antenna_vis": ("v = _get_antenna_vis(",),
     "get_baseline_vis": ("vis[t] =",),
 }
 
@@ -44,7 +44,7 @@
     "beam_interp": ("do_beam_interpolation(",),
     "get_tau": ("# compute tau",),
     "get_antenna_vis": ("meas_eq(",),
-    "get_baseline_vis": ("vis_inner_product(",),
+    "get_baseline_vis": ("cublas_complex_mm(",),
 }
 
 profiler = LineProfiler()
@@ -191,7 +191,7 @@ def profile(
     print("------------- Summary of timings -------------")
     for thing, (hits, time, time_per_hit, percent, nlines) in thing_stats.items():
         print(
-            f"{thing:>19}: {hits:>4} hits, {time:.2f} seconds, {time_per_hit:.2f} sec/hit, {percent:3.2f}%, {nlines} lines"
+            f"{thing:>19}: {hits:>4} hits, {time:.3e} seconds, {time_per_hit:.3e} sec/hit, {percent:3.2f}%, {nlines} lines"
         )
     print("----------------------------------------------")
 
@@ -201,8 +201,9 @@ def profile(
 
 
 def get_line_based_stats(lstats) -> tuple[dict, float]:
     """Convert the line-number based stats into line-based stats."""
+    time_unit = lstats.unit
     (fn, lineno, name), timings = sorted(lstats.timings.items())[0]
-    d, total_time = get_stats_and_lines(fn, lineno, timings)
+    d, total_time = get_stats_and_lines(fn, lineno, timings, time_unit)
     return d, total_time
 
@@ -244,7 +245,7 @@ def get_summary_stats(line_data, total_time, ids):
     return thing_stats
 
 
-def get_stats_and_lines(filename, start_lineno, timings):
+def get_stats_and_lines(filename, start_lineno, timings, time_unit):
     """Match up timing stats with line content of the code."""
     d = {}
     total_time = 0.0
@@ -267,13 +268,13 @@ def get_stats_and_lines(filename, start_lineno, timings):
 
         d[sublines[idx].rstrip("\n").rstrip("\r")] = (
             nhits,
-            time / 1e6,
-            float(time) / nhits / 1e6,
+            time * time_unit,
+            float(time) / nhits * time_unit,
             percent,
             lineno,
         )
 
-    return d, total_time / 1e6
+    return d, total_time * time_unit
 
 
 def get_standard_sim_params(
diff --git a/src/vis_cpu/cpu.py b/src/vis_cpu/cpu.py
index 70de0ec..2d71603 100644
--- a/src/vis_cpu/cpu.py
+++ b/src/vis_cpu/cpu.py
@@ -1,16 +1,23 @@
 """CPU-based implementation of the visibility simulator."""
 from __future__ import annotations
 
+import datetime
+import gc
+import linecache
 import logging
 import numpy as np
-import warnings
+import psutil
+import time
+import tracemalloc as tm
 from astropy.constants import c
+from collections.abc import Sequence
+
+# from pympler import tracker
 from pyuvdata import UVBeam
-from re import I
-from scipy.interpolate import RectBivariateSpline
-from typing import Callable, Optional, Sequence
+from typing import Callable
 
 from . import conversions
+from ._utils import human_readable_size
 
 # This enables us to put in profile decorators that will be no-ops if no profiling
 # library is being used.
@@ -139,6 +146,8 @@ def _evaluate_beam_cpu(
             if isinstance(bm, UVBeam)
             else {}
         )
+        if isinstance(bm, UVBeam) and not bm.future_array_shapes:
+            bm.use_future_array_shapes()
 
         interp_beam = bm.interp(
             az_array=az,
@@ -148,11 +157,11 @@ def _evaluate_beam_cpu(
         )[0]
 
         if polarized:
-            interp_beam = interp_beam[:, 0, :, 0, :]
+            interp_beam = interp_beam[:, :, 0, :]
         else:
             # Here we have already asserted that the beam is a power beam and
             # has only one polarization, so we just evaluate that one.
-            interp_beam = np.sqrt(interp_beam[0, 0, 0, 0, :])
+            interp_beam = np.sqrt(interp_beam[0, 0, 0, :])
 
         A_s[:, :, i] = interp_beam
 
@@ -200,6 +209,7 @@ def vis_cpu(
     polarized: bool = False,
     beam_idx: np.ndarray | None = None,
     beam_spline_opts: dict | None = None,
+    max_progress_reports: int = 100,
 ):
     """
     Calculate visibility from an input intensity map and beam model.
@@ -249,6 +259,11 @@ def vis_cpu(
         Optional length-NANT array specifying a beam index for each antenna. By
         default, either a single beam is assumed to apply to all antennas or each
         antenna gets its own beam.
+    beam_spline_opts : dict, optional
+        Dictionary of options to pass to the beam interpolation function.
+    max_progress_reports : int, optional
+        Maximum number of progress reports to print to the screen (if logging level
+        allows). Default is 100.
 
     Returns
     -------
@@ -257,6 +272,11 @@ def vis_cpu(
         shape (NTIMES, NFEED, NFEED, NANTS, NANTS), otherwise it will have
         shape (NTIMES, NANTS, NANTS).
     """
+    if not tm.is_tracing() and logger.isEnabledFor(logging.INFO):
+        tm.start()
+
+    highest_peak = _memtrace(0)
+
     nax, nfeed, nant, ntimes = _validate_inputs(
         precision, polarized, antpos, eq2tops, crd_eq, I_sky
     )
@@ -281,9 +301,20 @@ def vis_cpu(
     ang_freq = real_dtype(2.0 * np.pi * freq)
 
     # Zero arrays: beam pattern, visibilities, delays, complex voltages
-    vis = np.zeros((ntimes, nfeed * nant, nfeed * nant), dtype=complex_dtype)
+    vis = np.full((ntimes, nfeed * nant, nfeed * nant), 0.0, dtype=complex_dtype)
+    logger.info(f"Visibility Array takes {vis.nbytes/1024**2:.1f} MB")
+
     crd_eq = crd_eq.astype(real_dtype)
 
+    # Have up to 100 reports as it iterates through time.
+    report_chunk = ntimes // max_progress_reports + 1
+    pr = psutil.Process()
+    tstart = time.time()
+    mlast = pr.memory_info().rss
+    plast = tstart
+
+    highest_peak = _memtrace(highest_peak)
+
     # Loop over time samples
     for t, eq2top in enumerate(eq2tops.astype(real_dtype)):
         # Dot product converts ECI cosines (i.e. from RA and Dec) into ENU
@@ -296,7 +327,7 @@ def vis_cpu(
         nsrcs_up = len(tx)
         isqrt = Isqrt[above_horizon]
 
-        A_s = np.zeros((nax, nfeed, nbeam, nsrcs_up), dtype=complex_dtype)
+        A_s = np.full((nax, nfeed, nbeam, nsrcs_up), 0.0, dtype=complex_dtype)
 
         _evaluate_beam_cpu(
             A_s,
@@ -325,6 +356,10 @@ def vis_cpu(
         vis[t] = v.conj().dot(v.T)
         _log_array("vis", vis[t])
 
+        if not (t % report_chunk or t == ntimes - 1):
+            plast, mlast = _log_progress(tstart, plast, t + 1, ntimes, pr, mlast)
+            highest_peak = _memtrace(highest_peak)
+
     vis.shape = (ntimes, nfeed, nant, nfeed, nant)
 
     # Return visibilities with or without multiple polarization channels
@@ -348,9 +383,47 @@ def _get_antenna_vis(
     return v.reshape((nfeed * nant, nax * nsrcs_up))  # reform into matrix
 
 
+def _memtrace(highest_peak) -> int:
+    if logger.isEnabledFor(logging.INFO):
+        cm, pm = tm.get_traced_memory()
+        logger.info(f"Starting Memory usage : {cm/1024**3:.3f} GB")
+        logger.info(f"Starting Peak Mem usage: {pm/1024**3:.3f} GB")
+        logger.info(f"Tracemalloc Peak Memory (tot)(GB): {highest_peak / 1024**3:.2f}")
+        tm.reset_peak()
+        return max(pm, highest_peak)
+
+
 def _log_array(name, x):
     """Debug logging of the value of an array."""
-    if logger.getEffectiveLevel() <= logging.DEBUG:  # pragma: no cover
+    if logger.isEnabledFor(logging.DEBUG):  # pragma: no cover
         logger.debug(
             f"CPU: {name}: {x.flatten() if x.size < 40 else x.flatten()[:40]} {x.shape}"
         )
+
+
+def _log_progress(start_time, prev_time, iters, niters, pr, last_mem):
+    """Logging of progress."""
+    if not logger.isEnabledFor(logging.INFO):
+        return prev_time, last_mem
+
+    t = time.time()
+    lapsed = datetime.timedelta(seconds=(t - prev_time))
+    total = datetime.timedelta(seconds=(t - start_time))
+    per_iter = total / iters
+    expected = per_iter * niters
+
+    rss = pr.memory_info().rss
+    mem = human_readable_size(rss)
+    memdiff = human_readable_size(rss - last_mem, indicate_sign=True)
+
+    logger.info(
+        f"""
+        Progress Info   [{iters}/{niters} times ({100 * iters / niters:.1f}%)]
+        -> Update Time:   {lapsed}
+        -> Total Time:    {total} [{per_iter} per integration]
+        -> Expected Time: {expected} [{expected - total} remaining]
+        -> Memory Usage:  {mem} [{memdiff}]
+        """
+    )
+
+    return t, rss
diff --git a/src/vis_cpu/gpu.py b/src/vis_cpu/gpu.py
index 3ba0d28..06c47ba 100644
--- a/src/vis_cpu/gpu.py
+++ b/src/vis_cpu/gpu.py
@@ -3,15 +3,24 @@
 import logging
 
 import numpy as np
+import psutil
+import time
 import warnings
 from astropy.constants import c as speed_of_light
+from collections.abc import Sequence
 from pathlib import Path
 from pyuvdata import UVBeam
-from typing import Callable, Optional, Sequence
+from typing import Callable, Optional
 
 from . import conversions
 from ._uvbeam_to_raw import uvbeam_to_azza_grid
-from .cpu import _evaluate_beam_cpu, _validate_inputs, _wrangle_beams, vis_cpu
+from .cpu import (
+    _evaluate_beam_cpu,
+    _log_progress,
+    _validate_inputs,
+    _wrangle_beams,
+    vis_cpu,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -49,7 +58,7 @@
 except Exception as e:  # pragma: no cover
     # if installed but having initialization issues
     # warn, but default back to non-gpu functionality
-    warnings.warn(str(e))
+    warnings.warn(str(e), stacklevel=2)
     HAVE_CUDA = False
 
     Template = no_op
@@ -112,13 +121,15 @@ def vis_gpu(
     if not HAVE_CUDA:
         raise ImportError("You need to install the [gpu] extra to use this function!")
 
+    pr = psutil.Process()
     nax, nfeed, nant, ntimes = _validate_inputs(
         precision, polarized, antpos, eq2tops, crd_eq, I_sky
     )
 
     if beam_spline_opts:
         warnings.warn(
-            "You have passed beam_spline_opts, but these are not used in GPU."
+            "You have passed beam_spline_opts, but these are not used in GPU.",
+            stacklevel=1,
         )
 
     nsrc = len(I_sky)
@@ -277,7 +288,7 @@ def vis_gpu(
     # output CPU buffers for downloading answers
     vis_cpus = [
-        np.empty(shape=(nfeed * nant, nfeed * nant), dtype=complex_dtype)
+        np.full((nfeed * nant, nfeed * nant), 0.0, dtype=complex_dtype)
         for _ in range(chunk)
     ]
     streams = [driver.Stream() for _ in range(chunk)]
 
@@ -294,10 +305,16 @@ def vis_gpu(
     if use_uvbeam:
         event_order.insert(4, "interpolation")
 
-    vis = np.empty((ntimes, nfeed * nant, nfeed * nant), dtype=complex_dtype)
+    vis = np.full((ntimes, nfeed * nant, nfeed * nant), 0.0, dtype=complex_dtype)
 
     logger.info("Running With %s chunks: ", chunk)
 
+    report_chunk = ntimes // 100 + 1
+    pr = psutil.Process()
+    tstart = time.time()
+    mlast = pr.memory_info().rss
+    plast = tstart
+
     for t in range(ntimes):
         eq2top_gpu.set(eq2tops[t])  # defines sky orientation for this time step
         events = [{e: driver.Event() for e in event_order} for _ in range(chunk)]
@@ -409,7 +426,7 @@ def vis_gpu(
                 int(np.ceil(nfeed / float(meas_block[2]))),
             )
 
-            logger.info(f"Measurement Eq. Grid Size: {grid}")
+            logger.debug(f"Measurement Eq. Grid Size: {grid}")
 
             _logdebug(A_gpu, "Beam")
 
@@ -470,6 +487,9 @@ def vis_gpu(
         events[chunk - 1]["end"].synchronize()
         vis[t] = sum(vis_cpus)
 
+        if not (t % report_chunk or t == ntimes - 1):
+            plast, mlast = _log_progress(tstart, plast, t + 1, ntimes, pr, mlast)
+
     # teardown GPU configuration
     cublasDestroy(h)
     vis = vis.conj().reshape((ntimes, nfeed, nant, nfeed, nant))
diff --git a/src/vis_cpu/plot.py b/src/vis_cpu/plot.py
index 8c48b2e..52b8384 100644
--- a/src/vis_cpu/plot.py
+++ b/src/vis_cpu/plot.py
@@ -42,7 +42,7 @@ def _source_az_za_beam(
 
     # Get beam values
     interp_beam = beam.interp(az, za, np.atleast_1d(ref_freq))[0]
-    A_s = interp_beam[0, 0, 1, 0]  # (2, 1, 2, 1, Nptsrc)
+    A_s = interp_beam[0, 1, 0]  # (2, 2, 1, Nptsrc)
 
     # Horizon cut
     A_s = np.where(tz > 0, A_s, np.nan)
diff --git a/tests/__init__.py b/tests/__init__.py
index 4235c57..155bcce 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,5 +1,6 @@
 """Tests."""
 import numpy as np
+from astropy import units as un
 from astropy.coordinates import EarthLocation, Latitude, Longitude
 from astropy.time import Time
 from astropy.units import Quantity
@@ -139,7 +140,7 @@ def get_standard_sim_params(
         dec=Latitude(ra_dec[:, 1], "rad"),
         spectral_type="spectral_index",
         spectral_index=sources[:, 3],
-        stokes=stokes,
+        stokes=stokes * un.Jy,
         reference_frequency=Quantity(reference_frequency, "Hz"),
     )
 
diff --git a/tests/test_beam_interp_gpu.py b/tests/test_beam_interp_gpu.py
index 6ebeb0e..b55a3a3 100644
--- a/tests/test_beam_interp_gpu.py
+++ b/tests/test_beam_interp_gpu.py
@@ -178,10 +178,10 @@ def test_non_identity_beamfile(polarized):
         for ibeam, bm in enumerate(feed):
             print(iax, ifd, ibeam)
             np.testing.assert_allclose(
-                new_beam_uvb[iax, 0, ifd, 0].real, bm.real, rtol=1e-6
+                new_beam_uvb[iax, ifd, 0].real, bm.real, rtol=1e-6
             )
             np.testing.assert_allclose(
-                new_beam_uvb[iax, 0, ifd, 0].imag, bm.imag, rtol=1e-6
+                new_beam_uvb[iax, ifd, 0].imag, bm.imag, rtol=1e-6
             )
 
 
diff --git a/tests/test_beams.py b/tests/test_beams.py
index bb9c01d..b912a87 100644
--- a/tests/test_beams.py
+++ b/tests/test_beams.py
@@ -160,7 +160,7 @@ def freq() -> np.ndarray:
 
 
 @pytest.fixture(scope="function")
-def beam_list_unpol() -> List[EllipticalBeam]:
+def beam_list_unpol() -> list[EllipticalBeam]:
     """Get Gaussian beam and transform into an elliptical version."""
     base_beam = AnalyticBeam("gaussian", diameter=14.0)
     beam_analytic = EllipticalBeam(base_beam, xstretch=2.2, ystretch=1.0, rotation=40.0)
@@ -172,7 +172,7 @@
 
 
 @pytest.fixture(scope="function")
-def beam_list_pol() -> List[EllipticalBeam]:
+def beam_list_pol() -> list[EllipticalBeam]:
     """Get Gaussian beam and transform into an elliptical version with polarization."""
     base_beam = AnalyticBeam("gaussian", diameter=14.0)
     beam_analytic = EllipticalBeam(base_beam, xstretch=2.2, ystretch=1.0, rotation=40.0)
@@ -418,8 +418,10 @@ def test_covers_sky_almost_strong(uvbeam):
     beam1 = uvbeam.copy()
     beam2 = uvbeam.copy()
 
+    # Restrict to a certain frequency
     beam1.data_array = beam1.data_array[:, :, :, [0]]
     beam2.data_array = beam2.data_array[:, :, :, [0]]
+
     beam1.Nfreqs = 1
     beam2.Nfreqs = 1
 
diff --git a/tests/test_compare_pyuvsim.py b/tests/test_compare_pyuvsim.py
index 04c45fe..e7faf37 100644
--- a/tests/test_compare_pyuvsim.py
+++ b/tests/test_compare_pyuvsim.py
@@ -70,7 +70,6 @@ def test_compare_pyuvsim(polarized, use_analytic_beam):
         for j in range(i, nants):
             for if1, feed1 in enumerate(("X", "Y") if polarized else ("X",)):
                 for if2, feed2 in enumerate(("X", "Y") if polarized else ("X",)):
-
                     d_uvsim = uvd_uvsim.get_data(
                         (i, j, feed1 + feed2)
                     ).T  # pyuvsim visibility
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 858c226..b4ce2e8 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -82,7 +82,6 @@ def test_equatorial_to_enu():
 
     # Loop over LSTs
     for i, lst in enumerate(lsts):
-
         # Rotation matrices from ECI <-> ENU
         mat_eci_to_enu = conversions.eci_to_enu_matrix(lst, lat=hera_lat)
         mat_enu_to_eci = conversions.enu_to_eci_matrix(lst, lat=hera_lat)
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..d7808d6
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,20 @@
+"""Test the utils module."""
+from vis_cpu import _utils
+
+
+def test_human_readable_size():
+    """Test the human_readable_size function."""
+    assert _utils.human_readable_size(0) == "0.00 B"
+    assert _utils.human_readable_size(1) == "1.00 B"
+    assert _utils.human_readable_size(1023) == "1023.00 B"
+    assert _utils.human_readable_size(1024) == "1.00 KiB"
+    assert _utils.human_readable_size(1024**2) == "1.00 MiB"
+    assert _utils.human_readable_size(1024**3) == "1.00 GiB"
+    assert _utils.human_readable_size(1024**4) == "1.00 TiB"
+    assert _utils.human_readable_size(1024**5) == "1.00 PiB"
+    assert _utils.human_readable_size(1024**6) == "1024.00 PiB"
+    assert _utils.human_readable_size(1024**6, decimal_places=3) == "1024.000 PiB"
+    assert (
+        _utils.human_readable_size(1024**6, decimal_places=3, indicate_sign=True)
+        == "+1024.000 PiB"
+    )
diff --git a/tests/test_vis_cpu.py b/tests/test_vis_cpu.py
index fade812..6291026 100644
--- a/tests/test_vis_cpu.py
+++ b/tests/test_vis_cpu.py
@@ -49,5 +49,6 @@ def test_simulate_vis(polarized):
         polarized=polarized,
         precision=1,
         latitude=-30.7215 * np.pi / 180.0,
+        max_progress_reports=2,
     )
     assert np.all(~np.isnan(vis))  # check that there are no NaN values
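
Usage sketch (illustrative only, not applied by this patch): the progress and memory reports added in src/vis_cpu/cpu.py go through the standard logging module and only appear when the "vis_cpu.cpu" logger is enabled for INFO, while the helper added in src/vis_cpu/_utils.py can be called directly. The byte counts below are arbitrary examples, and the vis_cpu() call itself is elided because its full argument list is not shown here.

    import logging

    from vis_cpu import _utils

    # Reports from _log_progress()/_memtrace() are logged at INFO level; the number
    # of reports per run is capped by vis_cpu's new max_progress_reports argument
    # (default 100).
    logging.basicConfig(level=logging.INFO)

    # Same formatting used by the progress reports, called directly:
    print(_utils.human_readable_size(3 * 1024**3))                # "3.00 GiB"
    print(_utils.human_readable_size(-2048, indicate_sign=True))  # "-2.00 KiB"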