diff --git a/src/batched/dense/KokkosBatched_SVD_Decl.hpp b/src/batched/dense/KokkosBatched_SVD_Decl.hpp
new file mode 100644
index 0000000000..f727d00a35
--- /dev/null
+++ b/src/batched/dense/KokkosBatched_SVD_Decl.hpp
@@ -0,0 +1,66 @@
+#ifndef __KOKKOSBATCHED_SVD_DECL_HPP__
+#define __KOKKOSBATCHED_SVD_DECL_HPP__
+
+/// \author Brian Kelley (bmkelle@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
+namespace KokkosBatched {
+
+  /// Given a general matrix A (m x n), compute the full singular value decomposition (SVD):
+  /// U * diag(s) * V^T = A. U/V are orthogonal and s contains nonnegative values in descending order.
+  ///
+  /// Currently only supports real-valued matrices.
+  ///
+  /// Parameters:
+  ///   [in] A
+  ///     General matrix (rank 2 view), m x n.
+  ///     The contents of A are overwritten and undefined after calling this function.
+  ///   [out] U
+  ///     m left singular vectors (in columns). Dimensions m*m.
+  ///   [out] Vt
+  ///     n right singular vectors (in rows). Dimensions n*n.
+  ///   [out] s
+  ///     min(m, n) singular values.
+  ///   [in] W
+  ///     1D contiguous workspace. The required size is max(m, n).
+  ///
+  /// Preconditions:
+  ///   m == A.extent(0) == U.extent(0) == U.extent(1)
+  ///   n == A.extent(1) == Vt.extent(0) == Vt.extent(1)
+  ///   min(m, n) == s.extent(0)
+  ///   W.extent(0) >= max(m, n)
+  ///   W.stride(0) == 1 (contiguous)
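+  ///
+  /// Example (illustrative sketch only, not part of this interface): one way a caller
+  /// might set up views and invoke the kernel from device code. The sizes, view names,
+  /// and the use of Kokkos::DefaultExecutionSpace below are assumptions made for the
+  /// sake of the example.
+  ///
+  /// \code
+  ///   // A is 5 x 3, so U is 5 x 5, Vt is 3 x 3, s has length 3, and W has length 5.
+  ///   Kokkos::View<double**> A("A", 5, 3), U("U", 5, 5), Vt("Vt", 3, 3);
+  ///   Kokkos::View<double*> s("s", 3), W("W", 5);
+  ///   // ... fill A ...
+  ///   Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, 1),
+  ///     KOKKOS_LAMBDA(int) {
+  ///       //Full factorization: A = U * diag(s) * Vt (A is overwritten)
+  ///       KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag(), A, U, s, Vt, W);
+  ///       //Or, to compute only the singular values:
+  ///       //KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_S_Tag(), A, s, W);
+  ///     });
+  /// \endcode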
+
+  struct SVD_USV_Tag {};
+  struct SVD_S_Tag {};
+  // Note: Could easily add SV or US tags later if needed
+
+  struct SerialSVD {
+    //Version to compute full factorization: A == U * diag(s) * Vt
+    template <typename AViewType, typename UViewType, typename SViewType, typename VtViewType, typename WViewType>
+    KOKKOS_INLINE_FUNCTION
+    static int
+    invoke(SVD_USV_Tag, const AViewType &A,
+        const UViewType &U, const SViewType &s,
+        const VtViewType &Vt, const WViewType &W);
+
+    //Version which computes only singular values
+    template <typename AViewType, typename SViewType, typename WViewType>
+    KOKKOS_INLINE_FUNCTION
+    static int
+    invoke(SVD_S_Tag, const AViewType &A, const SViewType &s, const WViewType &W);
+  };
+
+} /// end namespace KokkosBatched
+
+#include "KokkosBatched_SVD_Serial_Impl.hpp"
+
+#endif
diff --git a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp
new file mode 100644
index 0000000000..cd943e71b9
--- /dev/null
+++ b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp
@@ -0,0 +1,51 @@
+#ifndef __KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP__
+#define __KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP__
+
+/// \author Brian Kelley (bmkelle@sandia.gov)
+
+#include "KokkosBatched_SVD_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  //Version which computes the full factorization
+  template <typename AViewType, typename UViewType, typename SViewType, typename VViewType, typename WViewType>
+  KOKKOS_INLINE_FUNCTION
+  int SerialSVD::
+  invoke(SVD_USV_Tag, const AViewType &A,
+      const UViewType &U, const SViewType &sigma,
+      const VViewType &Vt, const WViewType &work)
+  {
+    using value_type = typename AViewType::non_const_value_type;
+    return KokkosBatched::SerialSVDInternal::invoke<value_type>
+      (A.extent(0), A.extent(1),
+       A.data(), A.stride(0), A.stride(1),
+       U.data(), U.stride(0), U.stride(1),
+       Vt.data(), Vt.stride(0), Vt.stride(1),
+       sigma.data(), sigma.stride(0),
+       work.data());
+  }
+
+  //Version which computes only singular values
+  template <typename AViewType, typename SViewType, typename WViewType>
+  KOKKOS_INLINE_FUNCTION
+  int SerialSVD::
+  invoke(SVD_S_Tag, const AViewType &A, const SViewType &sigma, const WViewType &work)
+  {
+    using value_type = typename AViewType::non_const_value_type;
+    return KokkosBatched::SerialSVDInternal::invoke<value_type>
+      (A.extent(0), A.extent(1),
+       A.data(), A.stride(0), A.stride(1),
+       nullptr, 0, 0,
+       nullptr, 0, 0,
+       sigma.data(), sigma.stride(0),
+       work.data());
+  }
+
+} /// end namespace KokkosBatched
+
+#endif
diff --git a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp
new file mode 100644
index 0000000000..1a5ca961b6
--- /dev/null
+++ b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp
@@ -0,0 +1,355 @@
+#ifndef __KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP__
+
+/// \author Brian Kelley (bmkelle@sandia.gov)
+
+#include "Kokkos_MathematicalFunctions.hpp"
+#include "KokkosBatched_SetIdentity_Internal.hpp"
+#include "KokkosBatched_Givens_Serial_Internal.hpp"
+#include "KokkosBatched_ApplyGivens_Serial_Internal.hpp"
+#include "KokkosBatched_Householder_Serial_Internal.hpp"
+#include "KokkosBatched_ApplyHouseholder_Serial_Internal.hpp"
+
+//Use this macro to handle raw pointer/stride based 2D indexing in this file (just for readability)
+//Requires that for pointer X, the corresponding row/col strides are named Xs0 and Xs1.
+#define SVDIND(arr, i, j) arr[(i) * arr##s0 + (j) * arr##s1]
+#define SVDSWAP(a, b) {auto tmp = a; a = b; b = tmp;}
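+//For example, SVDIND(B, i, j) expands to B[(i) * Bs0 + (j) * Bs1], i.e. the (i, j) entry of B
+//under row stride Bs0 and column stride Bs1, and SVDSWAP exchanges the values of its two arguments.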
+
+namespace KokkosBatched {
+
+  ///
+  /// Serial Internal Impl
+  /// ====================
+
+  struct SerialSVDInternal {
+
+    //Find the two eigenvalues of [a11 a21 ; a21 a22] by solving the characteristic quadratic.
+    //Since matrix is symmetric these will be real.
+    //NOTE: this is essentially the Wilkinson shift routine already in Batched,
+    //however this is simpler because it exploits the symmetric structure, and the realness of the eigenvalues.
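+    //(The discriminant b^2 - 4ac simplifies to (a11 - a22)^2 + 4*a21^2, which is nonnegative,
+    // so the square root below is always real.)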
+    template <typename value_type>
+    KOKKOS_INLINE_FUNCTION static void symEigen2x2(value_type a11, value_type a21, value_type a22, value_type& e1, value_type& e2)
+    {
+      value_type a = Kokkos::ArithTraits<value_type>::one();
+      value_type b = -a11 - a22;
+      value_type c = a11 * a22 - a21 * a21;
+      value_type sqrtDet = Kokkos::Experimental::sqrt(b * b - 4 * a * c);
+      e1 = (-b + sqrtDet) / (2 * a);
+      e2 = (-b - sqrtDet) / (2 * a);
+    }
+
+    // B is a square submatrix on the diagonal.
+    // Usub is a subset of columns of U
+    // Vtsub is a subset of rows of Vt
+    //
+    // B22 is nsub * nsub, Usub is m * nsub, and Vtsub is nsub * n
+    template <typename value_type>
+    KOKKOS_INLINE_FUNCTION static void svdStep(value_type* B, value_type* U, value_type* Vt, int um, int vn, int n, int Bs0, int Bs1, int Us0, int Us1, int Vts0, int Vts1)
+    {
+      using KAT = Kokkos::ArithTraits<value_type>;
+      //Compute the eigenvalues of trailing 2x2
+      value_type dn = SVDIND(B, n-1, n-1);
+      value_type dm = SVDIND(B, n-2, n-2);
+      value_type fm = SVDIND(B, n-2, n-1);
+      value_type fmm1 = (n > 2) ? SVDIND(B, n-3, n-2) : KAT::zero();
+      value_type target = dn * dn + fm * fm;
+      value_type e1, e2, mu;
+      symEigen2x2(dm * dm + fmm1 * fmm1, dm * fm, target, e1, e2);
+      //the shift is the eigenvalue closer to the last diagonal entry of B^T*B
+      if(fabs(e1 - target) < fabs(e2 - target))
+        mu = e1;
+      else
+        mu = e2;
+      value_type y = SVDIND(B, 0, 0) * SVDIND(B, 0, 0) - mu;
+      value_type z = SVDIND(B, 0, 0) * SVDIND(B, 0, 1);
+      for(int k = 0; k < n - 1; k++)
+      {
+        //Use Givens to zero out z in [y; z]
+        Kokkos::pair<value_type, value_type> G;
+        value_type discard; //Don't actually write [alpha; 0] anywhere
+        KokkosBatched::SerialGivensInternal::invoke(y, z, &G, &discard);
+        //apply the Givens transformation to B on the right, to columns k,k+1
+        //B := BG(k, k+1, theta)
+        int minrow = KOKKOSKERNELS_MACRO_MAX(0, k - 1);
+        int maxrow = KOKKOSKERNELS_MACRO_MIN(n, k + 2);
+        KokkosBatched::SerialApplyRightGivensInternal::invoke(G, maxrow - minrow, &SVDIND(B, minrow, k + 1), Bs0, &SVDIND(B, minrow, k), Bs0);
+        if(Vt)
+        {
+          KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, vn, &SVDIND(Vt, k + 1, 0), Vts1, &SVDIND(Vt, k, 0), Vts1);
+        }
+        y = SVDIND(B, k, k);
+        z = SVDIND(B, k + 1, k);
+        KokkosBatched::SerialGivensInternal::invoke(y, z, &G, &SVDIND(B, k, k));
+        SVDIND(B, k + 1, k) = KAT::zero();
+        int mincol = k + 1;
+        int maxcol = KOKKOSKERNELS_MACRO_MIN(n, k + 3);
+        //apply Givens transformation to B on the left, to rows k, k + 1
+        //B := G(k, k+1, theta)^T * B
+        KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, maxcol - mincol, &SVDIND(B, k + 1, mincol), Bs1, &SVDIND(B, k, mincol), Bs1);
+        if(U)
+        {
+          KokkosBatched::SerialApplyRightGivensInternal::invoke(G, um, &SVDIND(U, 0, k + 1), Us0, &SVDIND(U, 0, k), Us0);
+        }
+        if(k < n - 2)
+        {
+          y = SVDIND(B, k, k + 1);
+          z = SVDIND(B, k, k + 2);
+        }
+      }
+    }
+
+    //Deal with B(i, i) = 0, by chasing superdiagonal nonzero across row i.
+    //Assumes i is not the last row.
+    //U is m*m, B is n*n
+    template <typename value_type>
+    KOKKOS_INLINE_FUNCTION static void svdZeroRow(int i, value_type* B, int n, int Bs0, int Bs1, value_type* U, int m, int Us0, int Us1)
+    {
+      Kokkos::pair<value_type, value_type> G;
+      for(int j = i + 1; j < n; j++)
+      {
+        //Zero out B(i, j) against diagonal j, introducing nonzero in B(i, j + 1)
+        KokkosBatched::SerialGivensInternal::invoke(SVDIND(B, j, j), SVDIND(B, i, j), &G, &SVDIND(B, j, j));
+        SVDIND(B, i, j) = Kokkos::ArithTraits<value_type>::zero();
+        //Now, only need to apply givens to a single column (if not already at the end),
+        //introducing the next nonzero
+        if(j < n - 1)
+        {
+          KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, 1, &SVDIND(B, i, j + 1), Bs1, &SVDIND(B, j, j + 1), Bs1);
+        }
+        if(U)
+        {
+          KokkosBatched::SerialApplyRightGivensInternal::invoke(G, m, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, j), Us0);
+        }
+      }
+    }
+
+    template <typename value_type>
+    KOKKOS_INLINE_FUNCTION static void svdZeroLastColumn(value_type* B, int n, int Bs0, int Bs1, value_type* Vt, int Vts0, int Vts1)
+    {
+      //Deal with B(n-1, n-1) = 0, by chasing the superdiagonal nonzero up the last column.
+      Kokkos::pair<value_type, value_type> G;
+      for(int j = n - 2; j >= 0; j--)
+      {
+        KokkosBatched::SerialGivensInternal::invoke(SVDIND(B, j, j), SVDIND(B, j, n - 1), &G, &SVDIND(B, j, j));
+        SVDIND(B, j, n - 1) = Kokkos::ArithTraits<value_type>::zero();
+        if(j != 0)
+        {
+          KokkosBatched::SerialApplyRightGivensInternal::invoke(G, 1, &SVDIND(B, j - 1, n - 1), Bs0, &SVDIND(B, j - 1, j), Bs0);
+        }
+        if(Vt)
+        {
+          KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, n, &SVDIND(Vt, n - 1, 0), Vts1, &SVDIND(Vt, j, 0), Vts1);
+        }
+      }
+    }
+
+    template <typename value_type>
+    KOKKOS_INLINE_FUNCTION static void bidiagonalize(int m, int n, value_type* A, int As0, int As1, value_type* U, int Us0, int Us1, value_type* Vt, int Vts0, int Vts1, value_type* work)
+    {
+      using KAT = Kokkos::ArithTraits<value_type>;
+      value_type tau;
+      for(int i = 0; i < n; i++)
+      {
+        //Eliminating column i of A below the diagonal
+        KokkosBatched::SerialLeftHouseholderInternal::invoke(m - i - 1, &SVDIND(A, i, i), &SVDIND(A, i + 1, i), As0, &tau);
+        if(n - i > 1)
+        {
+          KokkosBatched::SerialApplyLeftHouseholderInternal::invoke(m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1), As0, As1, work);
+        }
+        if(U)
+        {
+          KokkosBatched::SerialApplyRightHouseholderInternal::invoke(m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work);
+        }
+        //Zero out A subdiag explicitly (NOTE: may not be necessary...)
+        for(int j = i + 1; j < m; j++)
+        {
+          SVDIND(A, j, i) = KAT::zero();
+        }
+        if(i < n - 2)
+        {
+          //Eliminating row i of A to the right of the 1st superdiagonal
+          KokkosBatched::SerialLeftHouseholderInternal::invoke(n - i - 2, &SVDIND(A, i, i + 1), &SVDIND(A, i, i + 2), As1, &tau);
+          if(m - i > 1)
+          {
+            KokkosBatched::SerialApplyRightHouseholderInternal::invoke(m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(A, i + 1, i + 1), As0, &SVDIND(A, i + 1, i + 2), As0, As1, work);
+          }
+          if(Vt)
+          {
+            KokkosBatched::SerialApplyLeftHouseholderInternal::invoke(n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0, Vts1, work);
+          }
+          //Zero out A superdiag row explicitly
+          for(int j = i + 2; j < n; j++)
+          {
+            SVDIND(A, i, j) = KAT::zero();
+          }
+        }
+      }
+    }
+
+    //Compute the SVD of a bidiagonal matrix B. Apply inverse transformations to U and Vt to maintain the product U*B*Vt.
+    //At the end, the singular values are copied to sigma.
+    template <typename value_type>
+    KOKKOS_INLINE_FUNCTION static void bidiSVD(int m, int n, value_type* B, int Bs0, int Bs1, value_type* U, int Us0, int Us1, value_type* Vt, int Vts0, int Vts1, value_type* sigma, int ss)
+    {
+      using KAT = Kokkos::ArithTraits<value_type>;
+      const value_type eps = Kokkos::ArithTraits<value_type>::epsilon();
+      int p = 0;
+      int q = 0;
+      while(true)
+      {
+        //Zero out tiny superdiagonal entries
+        for(int i = 0; i < n - 1; i++)
+        {
+          if(fabs(SVDIND(B, i, i + 1)) < eps * (fabs(SVDIND(B, i, i)) + fabs(SVDIND(B, i + 1, i + 1))))
+          {
+            SVDIND(B, i, i + 1) = KAT::zero();
+          }
+        }
+        //Find q: first column from the end with nonzero superdiagonal.
+        //If no such columns, will be 0.
+        for(q = n - 1; q > 0; q--)
+        {
+          if(SVDIND(B, q - 1, q) != KAT::zero())
+            break;
+        }
+        if(q == 0)
+        {
+          //B is completely diagonal, so it contains singular values and we are done.
+          break;
+        }
+        q++;
+        //now, q is the upper (exclusive) bound of submatrix on which to do SVD step.
+        //Find min p, so that [p, q) x [p, q) submatrix has all nonzero superdiagonals.
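+        //(After the loop below, [p, q) is the largest block ending at q whose superdiagonal
+        // entries are all nonzero; the SVD step is applied only to that block.)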
+        for(p = q - 1; p > 0; p--)
+        {
+          if(SVDIND(B, p - 1, p) == KAT::zero())
+            break;
+        }
+        //If there are zero diagonals in this range, eliminate the entire row
+        //(effectively decoupling into two subproblems)
+        for(int i = q - 1; i >= p; i--)
+        {
+          if(SVDIND(B, i, i) == KAT::zero())
+          {
+            if(i == n - 1)
+            {
+              //Last diagonal entry being 0 is a special case:
+              //zero out the superdiagonal above it by chasing it up the last column.
+              svdZeroLastColumn(B, n, Bs0, Bs1, Vt, Vts0, Vts1);
+            }
+            else if(SVDIND(B, i, i + 1) != KAT::zero())
+            {
+              svdZeroRow(i, B, n, Bs0, Bs1, U, m, Us0, Us1);
+            }
+          }
+          continue;
+        }
+        int nsub = q - p;
+        //B22 is nsub * nsub, Usub is m * nsub, and Vtsub is nsub * n
+        svdStep(&SVDIND(B, p, p), &SVDIND(U, 0, p), &SVDIND(Vt, p, 0), m, n, nsub, Bs0, Bs1, Us0, Us1, Vts0, Vts1);
+      }
+      for(int i = 0; i < n; i++)
+      {
+        sigma[i * ss] = SVDIND(B, i, i);
+      }
+    }
+
+    //Convert SVD into conventional form: singular values positive and in descending order
+    template <typename value_type>
+    KOKKOS_INLINE_FUNCTION static void postprocessSVD(int m, int n, value_type* U, int Us0, int Us1, value_type* Vt, int Vts0, int Vts1, value_type* sigma, int ss)
+    {
+      //First step: flip signs on negative singular values
+      for(int i = 0; i < n; i++)
+      {
+        if(sigma[i * ss] < 0)
+        {
+          sigma[i * ss] = -sigma[i * ss];
+          if(Vt)
+          {
+            for(int j = 0; j < n; j++)
+              SVDIND(Vt, i, j) = -SVDIND(Vt, i, j);
+          }
+        }
+      }
+      //Second step: stable selection sort to put singular values in order.
+      //Using selection sort because the quadratic part only applies to sigma (O(n^2) total), and it minimizes column swaps in U,V (O(mn) total movement).
+      for(int i = 0; i < n - 1; i++)
+      {
+        //find the proper singular value to go in position i
+        value_type maxval = sigma[i * ss];
+        int maxloc = i;
+        for(int j = i + 1; j < n; j++)
+        {
+          if(sigma[j * ss] > maxval)
+          {
+            maxval = sigma[j * ss];
+            maxloc = j;
+          }
+        }
+        //swap singular values and U/V columns i and maxloc (if maxloc is not already in the right place)
+        if(i != maxloc)
+        {
+          SVDSWAP(sigma[i * ss], sigma[maxloc * ss]);
+          if(U)
+          {
+            for(int j = 0; j < m; j++)
+              SVDSWAP(SVDIND(U, j, i), SVDIND(U, j, maxloc))
+          }
+          if(Vt)
+          {
+            for(int j = 0; j < n; j++)
+              SVDSWAP(SVDIND(Vt, i, j), SVDIND(Vt, maxloc, j))
+          }
+        }
+      }
+    }
+
+    template <typename value_type>
+    KOKKOS_INLINE_FUNCTION static int
+    invoke(int m, int n,
+           value_type* A, int As0, int As1,
+           value_type* U, int Us0, int Us1,
+           value_type* Vt, int Vts0, int Vts1,
+           value_type* sigma, int ss,
+           value_type* work)
+    {
+      //First, if m < n, compute the SVD of A^T instead, which gives the factors (V, s, U^T).
+      //This just means swapping U & Vt, and implicitly transposing A, U and Vt.
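+      //(Swapping the row and column strides is an implicit transpose: with As0 and As1
+      // exchanged, SVDIND(A, i, j) reads the entry that was originally A(j, i).)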
+      if(m < n)
+      {
+        //Transpose A
+        SVDSWAP(m, n);
+        SVDSWAP(As0, As1);
+        //Transpose and swap U, Vt
+        SVDSWAP(U, Vt);
+        SVDSWAP(Us0, Vts1);
+        SVDSWAP(Us1, Vts0);
+      }
+      if(U)
+      {
+        KokkosBatched::SerialSetIdentityInternal::invoke(m, m, U, Us0, Us1);
+      }
+      if(Vt)
+      {
+        KokkosBatched::SerialSetIdentityInternal::invoke(n, n, Vt, Vts0, Vts1);
+      }
+      if(m == 0 || n == 0)
+      {
+        //sigma is length 0, so there's nothing left to compute
+        return 0;
+      }
+      bidiagonalize(m, n, A, As0, As1, U, Us0, Us1, Vt, Vts0, Vts1, work);
+      bidiSVD(m, n, A, As0, As1, U, Us0, Us1, Vt, Vts0, Vts1, sigma, ss);
+      postprocessSVD(m, n, U, Us0, Us1, Vt, Vts0, Vts1, sigma, ss);
+      return 0;
+    }
+  };
+
+} /// end namespace KokkosBatched
+
+#undef SVDIND
+#undef SVDSWAP
+
+#endif
diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp
index 3e4344ea84..d472e2cee9 100644
--- a/test_common/KokkosKernels_TestUtils.hpp
+++ b/test_common/KokkosKernels_TestUtils.hpp
@@ -243,6 +243,31 @@ struct Functor_BatchedVanillaGEMM {
   }
 };
 
+//Compute C := alpha * AB + beta * C
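+//For example, given host-accessible rank-2 views A (m x k), B (k x n), and C (m x n),
+//vanillaGEMM(1.0, A, B, 0.0, C) overwrites C with the product A*B.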
+template <class ViewTypeA, class ViewTypeB, class ViewTypeC>
+void vanillaGEMM(typename ViewTypeC::non_const_value_type alpha,
+                 const ViewTypeA& A, const ViewTypeB& B,
+                 typename ViewTypeC::non_const_value_type beta,
+                 const ViewTypeC& C) {
+  using value_type = typename ViewTypeC::non_const_value_type;
+  using KAT = Kokkos::ArithTraits<value_type>;
+  int m = A.extent(0);
+  int k = A.extent(1);
+  int n = B.extent(1);
+  for(int i = 0; i < m; i++)
+  {
+    for(int j = 0; j < n; j++)
+    {
+      value_type sum = KAT::zero();
+      for(int ii = 0; ii < k; ii++)
+      {
+        sum += A(i, ii) * B(ii, j);
+      }
+      C(i, j) = alpha * sum + beta * C(i, j);
+    }
+  }
+}
+
 template <class ViewTypeA, class ViewTypeX, class ViewTypeY>
 void vanillaGEMV(char mode, typename ViewTypeA::non_const_value_type alpha,
                  const ViewTypeA& A, const ViewTypeX& x,
diff --git a/unit_test/batched/dense/Test_Batched_Dense.hpp b/unit_test/batched/dense/Test_Batched_Dense.hpp
index 0541b88d7e..60dcda20cc 100644
--- a/unit_test/batched/dense/Test_Batched_Dense.hpp
+++ b/unit_test/batched/dense/Test_Batched_Dense.hpp
@@ -37,6 +37,7 @@
 #include "Test_Batched_SerialTrtri.hpp"
 #include "Test_Batched_SerialTrtri_Real.hpp"
 #include "Test_Batched_SerialTrtri_Complex.hpp"
+#include "Test_Batched_SerialSVD.hpp"
 
 // Team Kernels
 #include "Test_Batched_TeamGemm.hpp"
diff --git a/unit_test/batched/dense/Test_Batched_SerialSVD.hpp b/unit_test/batched/dense/Test_Batched_SerialSVD.hpp
new file mode 100644
index 0000000000..fbdfbc207a
--- /dev/null
+++ b/unit_test/batched/dense/Test_Batched_SerialSVD.hpp
@@ -0,0 +1,409 @@
+/// \author Brian Kelley (bmkelle@sandia.gov)
+
+#include "KokkosBatched_SVD_Decl.hpp"            //For testing overall kernel
+#include "KokkosBatched_SVD_Serial_Internal.hpp" //For unit testing individual components
+#include "KokkosBatched_SetIdentity_Decl.hpp"
+
+namespace Test
+{
+  template <typename Scalar>
+  Scalar svdEpsilon()
+  {throw std::runtime_error("Unsupported scalar type");}
+
+  template <>
+  double svdEpsilon<double>()
+  {return 1e-13;}
+
+  template <>
+  float svdEpsilon<float>()
+  {return 2e-6f;}
+}
+
+template <typename Vector>
+double simpleNorm2(const Vector& v)
+{
+  using Scalar = typename Vector::non_const_value_type;
+  using KAT = Kokkos::ArithTraits<Scalar>;
+  auto vhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v);
+  double d = 0;
+  for(size_t i = 0; i < v.extent(0); i++)
+  {
+    double m = KAT::abs(vhost(i));
+    d += m * m;
+  }
+  return Kokkos::Experimental::sqrt(d);
+}
+
+template <typename V1, typename V2>
+typename V1::non_const_value_type simpleDot(const V1& v1, const V2& v2)
+{
+  using Scalar = typename V1::non_const_value_type;
+  using KAT = Kokkos::ArithTraits<Scalar>;
+  auto v1host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v1);
+  auto v2host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v2);
+  typename V1::non_const_value_type val = KAT::zero();
+  for(size_t i = 0; i < v1.extent(0); i++)
+  {
+    val += v1host(i) * v2host(i);
+  }
+  return val;
+}
+
+//Check that all columns of X are unit length and pairwise orthogonal
+template <typename Mat>
+void verifyOrthogonal(const Mat& X)
+{
+  using value_type = typename Mat::non_const_value_type;
+  int k = X.extent(1);
+  for(int i = 0; i < k; i++)
+  {
+    auto col1 = Kokkos::subview(X, Kokkos::ALL(), i);
+    double len = simpleNorm2(col1);
+    Test::EXPECT_NEAR_KK(len, 1.0, Test::svdEpsilon<value_type>());
+    for(int j = 0; j < i; j++)
+    {
+      auto col2 = Kokkos::subview(X, Kokkos::ALL(), j);
+      double d = Kokkos::ArithTraits<value_type>::abs(simpleDot(col1, col2));
+      Test::EXPECT_NEAR_KK(d, 0.0, Test::svdEpsilon<value_type>());
+    }
+  }
+}
+
+template <typename AView, typename UView, typename VtView, typename SigmaView>
+void verifySVD(const AView& A, const UView& U, const VtView& Vt, const SigmaView& sigma)
+{
+  using value_type = typename AView::non_const_value_type;
+  using KAT = Kokkos::ArithTraits<value_type>;
+  //Check that U/V columns are unit length and orthogonal, and that U * diag(sigma) * V^T == A
+  int m = A.extent(0);
+  int n = A.extent(1);
+  int maxrank = std::min(m, n);
+  verifyOrthogonal(U);
+  //NOTE: V^T being square and orthonormal implies that V is, so we don't have to transpose it here.
+  verifyOrthogonal(Vt);
+  AView usvt("USV^T", m, n);
+  for(int i = 0; i < maxrank; i++)
+  {
+    auto Ucol = Kokkos::subview(U, Kokkos::ALL(), Kokkos::make_pair(i, i + 1));
+    auto Vtrow = Kokkos::subview(Vt, Kokkos::make_pair(i, i + 1), Kokkos::ALL());
+    Test::vanillaGEMM(sigma(i), Ucol, Vtrow, 1.0, usvt);
+  }
+  for(int i = 0; i < m; i++)
+  {
+    for(int j = 0; j < n; j++)
+    {
+      Test::EXPECT_NEAR_KK(usvt(i, j), A(i, j), Test::svdEpsilon<value_type>());
+    }
+  }
+  //Make sure all singular values are positive
+  for(int i = 0; i < maxrank; i++)
+  {
+    EXPECT_GE(sigma(i), KAT::zero());
+  }
+  //Make sure singular values are in descending order
+  for(int i = 0; i < maxrank - 1; i++)
+  {
+    EXPECT_GE(sigma(i), sigma(i + 1));
+  }
+}
+
+template <typename Matrix>
+Matrix createRandomMatrix(int m, int n, int deficiency, double maxval = 1.0)
+{
+  using Scalar = typename Matrix::non_const_value_type;
+  Matrix mat("A", m, n);
+  auto mhost = Kokkos::create_mirror_view(mat);
+  //Fill mat with random values first
+  if(maxval != 0.0)
+  {
+    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> rand_pool(13718);
+    Scalar minrand, maxrand;
+    Test::getRandomBounds(maxval, minrand, maxrand);
+    Kokkos::fill_random(mhost, rand_pool, minrand, maxrand);
+  }
+  //Apply the rank deficiency.
+  //If m < n, make some rows a multiple of the first row.
+  //Otherwise, make some columns a multiple of the first column.
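+  //(Each such dependent row or column reduces the rank by one, so the expected rank of the
+  // result is min(m, n) - deficiency.)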
+  if(m < n)
+  {
+    for(int i = 0; i < deficiency; i++)
+    {
+      //make row i + 1 a multiple of row 0
+      for(int j = 0; j < n; j++)
+      {
+        mhost(i + 1, j) = (double) (i + 2) * mhost(0, j);
+      }
+    }
+  }
+  else
+  {
+    for(int i = 0; i < deficiency; i++)
+    {
+      //make col i + 1 a multiple of col 0
+      for(int j = 0; j < m; j++)
+      {
+        mhost(j, i + 1) = (double) (i + 2) * mhost(j, 0);
+      }
+    }
+  }
+  Kokkos::deep_copy(mat, mhost);
+  return mat;
+}
+
+template <typename Matrix, typename Vector>
+struct SerialSVDFunctor_Full
+{
+  SerialSVDFunctor_Full(const Matrix& A_, const Matrix& U_, const Matrix& Vt_, const Vector& sigma_, const Vector& work_)
+    : A(A_), U(U_), Vt(Vt_), sigma(sigma_), work(work_)
+  {}
+
+  //NOTE: this functor is only meant to be launched with a single element range policy
+  KOKKOS_INLINE_FUNCTION void operator()(int) const
+  {
+    KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag(), A, U, sigma, Vt, work);
+  }
+
+  Matrix A;
+  Matrix U;
+  Matrix Vt;
+  Vector sigma;
+  Vector work;
+};
+
+template <typename Matrix, typename Vector>
+struct SerialSVDFunctor_SingularValuesOnly
+{
+  SerialSVDFunctor_SingularValuesOnly(const Matrix& A_, const Vector& sigma_, const Vector& work_)
+    : A(A_), sigma(sigma_), work(work_)
+  {}
+
+  //NOTE: this functor is only meant to be launched with a single element range policy
+  KOKKOS_INLINE_FUNCTION void operator()(int) const
+  {
+    KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_S_Tag(), A, sigma, work);
+  }
+
+  Matrix A;
+  Vector sigma;
+  Vector work;
+};
+
+template <typename Scalar, typename Layout, typename Device>
+void testSerialSVD(int m, int n, int deficiency, double maxval = 1.0)
+{
+  using Matrix = Kokkos::View<Scalar**, Layout, Device>;
+  using Vector = Kokkos::View<Scalar*, Layout, Device>;
+  using ExecSpace = typename Device::execution_space;
+  Matrix A = createRandomMatrix<Matrix>(m, n, deficiency, maxval);
+  //Fill U, Vt, sigma with nonzeros as well to make sure they are properly overwritten
+  Matrix U("U", m, m);
+  Matrix Vt("Vt", n, n);
+  int maxrank = std::min(m, n);
+  Vector sigma("sigma", maxrank);
+  Vector work("work", std::max(m, n));
+  Kokkos::deep_copy(U, -5.0);
+  Kokkos::deep_copy(Vt, -5.0);
+  Kokkos::deep_copy(sigma, -5.0);
+  Kokkos::deep_copy(work, -5.0);
+  //Make a copy of A (before SVD) for verification, since the original will be overwritten
+  typename Matrix::HostMirror Acopy("Acopy", m, n);
+  Kokkos::deep_copy(Acopy, A);
+  //Run the SVD
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1),
+      SerialSVDFunctor_Full<Matrix, Vector>(A, U, Vt, sigma, work));
+  //Get the results back
+  auto Uhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U);
+  auto Vthost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Vt);
+  auto sigmaHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma);
+  //Verify the SVD is correct
+  verifySVD(Acopy, Uhost, Vthost, sigmaHost);
+}
+
+template <typename Scalar, typename Layout, typename Device>
+void testSerialSVDSingularValuesOnly(int m, int n)
+{
+  using Matrix = Kokkos::View<Scalar**, Layout, Device>;
+  using Vector = Kokkos::View<Scalar*, Layout, Device>;
+  using ExecSpace = typename Device::execution_space;
+  Matrix A = createRandomMatrix<Matrix>(m, n, 0);
+  //Fill U, Vt, sigma with nonzeros as well to make sure they are properly overwritten
+  Matrix U("U", m, m);
+  Matrix Vt("Vt", n, n);
+  int maxrank = std::min(m, n);
+  Vector sigma1("sigma", maxrank);
+  Vector sigma2("sigma", maxrank);
+  Vector work("work", std::max(m, n));
+  Kokkos::deep_copy(U, -5.0);
+  Kokkos::deep_copy(Vt, -5.0);
+  Kokkos::deep_copy(sigma1, -5.0);
+  Kokkos::deep_copy(sigma2, -7.0);
+  Kokkos::deep_copy(work, -5.0);
+  //Make a copy of A (before SVD) for verification, since the original will be overwritten
+  typename Matrix::HostMirror Acopy("Acopy", m, n);
+  Kokkos::deep_copy(Acopy, A);
+
+  //Run the SVD (full mode)
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1),
+      SerialSVDFunctor_Full<Matrix, Vector>(A, U, Vt, sigma1, work));
+  Kokkos::deep_copy(A, Acopy);
+  //Run the same SVD (singular values only mode)
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1),
+      SerialSVDFunctor_SingularValuesOnly<Matrix, Vector>(A, sigma2, work));
+  auto sigma1Host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma1);
+  auto sigma2Host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma2);
+  //Make sure they match
+  for(int i = 0; i < maxrank; i++)
+  {
+    Test::EXPECT_NEAR_KK(sigma1Host(i), sigma2Host(i), Test::svdEpsilon<Scalar>());
+  }
+}
+
+//Test the bidiagonal n*n SVD step where the last diagonal entry is 0
+template <typename Scalar, typename Layout, typename Device>
+void testSerialSVDZeroLastRow(int n)
+{
+  //Generate a bidiagonal matrix
+  using Matrix = Kokkos::View<Scalar**, Layout, Kokkos::HostSpace>;
+  using KAT = Kokkos::ArithTraits<Scalar>;
+  Matrix B = createRandomMatrix<Matrix>(n, n, 0, 1.0);
+  //Zero out entries to make B bidiagonal
+  for(int i = 0; i < n; i++)
+  {
+    for(int j = 0; j < n; j++)
+    {
+      if(i != j && i + 1 != j)
+      {
+        B(i, j) = KAT::zero();
+      }
+    }
+  }
+  //Also zero out the final diagonal to test this routine
+  B(n - 1, n - 1) = KAT::zero();
+  Matrix Vt("Vt", n, n);
+  KokkosBatched::SerialSetIdentity::invoke(Vt);
+  //Compute the initial product to make sure it's maintained by the routine
+  Matrix BVt("UBVt", n, n);
+  Test::vanillaGEMM(1.0, B, Vt, 0.0, BVt);
+  //Run the routine (just on host)
+  KokkosBatched::SerialSVDInternal::svdZeroLastColumn(B.data(), n, B.stride(0), B.stride(1), Vt.data(), Vt.stride(0), Vt.stride(1));
+  //Check that B is still bidiagonal (to a tight tolerance, but not exactly zero)
+  for(int i = 0; i < n; i++)
+  {
+    for(int j = 0; j < n; j++)
+    {
+      if(i != j && i + 1 != j)
+      {
+        Test::EXPECT_NEAR_KK(B(i, j), KAT::zero(), Test::svdEpsilon<Scalar>());
+      }
+    }
+  }
+  //Check that the last superdiagonal is now zero
+  Test::EXPECT_NEAR_KK(B(n - 2, n - 1), KAT::zero(), Test::svdEpsilon<Scalar>());
+  //Check that the product is still maintained
+  Matrix BVt2("UBVt", n, n);
+  Test::vanillaGEMM(1.0, B, Vt, 0.0, BVt2);
+  for(int i = 0; i < n; i++)
+  {
+    for(int j = 0; j < n; j++)
+    {
+      Test::EXPECT_NEAR_KK(BVt(i, j), BVt2(i, j), Test::svdEpsilon<Scalar>());
+    }
+  }
+  //Check that Vt is still orthogonal
+  verifyOrthogonal(Vt);
+}
+
+//Test bidiagonal n*n SVD step where some diagonal i (not the last) is 0.
+template <typename Scalar, typename Layout, typename Device>
+void testSerialSVDZeroDiagonal(int n, int row)
+{
+  //Generate a bidiagonal matrix
+  using Matrix = Kokkos::View<Scalar**, Layout, Kokkos::HostSpace>;
+  using KAT = Kokkos::ArithTraits<Scalar>;
+  int m = n + 2; //Make U somewhat bigger to make sure the Givens transforms are applied correctly
+  Matrix B = createRandomMatrix<Matrix>(m, n, 0, 1.0);
+  //Zero out entries to make B bidiagonal
+  for(int i = 0; i < m; i++)
+  {
+    for(int j = 0; j < n; j++)
+    {
+      if(i != j && i + 1 != j)
+      {
+        B(i, j) = KAT::zero();
+      }
+    }
+  }
+  //Also zero out a diagonal to test this routine
+  B(row, row) = KAT::zero();
+  Matrix U("U", m, m);
+  KokkosBatched::SerialSetIdentity::invoke(U);
+  //Compute the initial product to make sure it's maintained by the routine
+  Matrix UB("UB", m, n);
+  Test::vanillaGEMM(1.0, U, B, 0.0, UB);
+  //Run the routine (just on host)
+  KokkosBatched::SerialSVDInternal::svdZeroRow(row, B.data(), n, B.stride(0), B.stride(1), U.data(), m, U.stride(0), U.stride(1));
+  //Check that B is still bidiagonal (to a tight tolerance, but not exactly zero)
+  for(int i = 0; i < m; i++)
+  {
+    for(int j = 0; j < n; j++)
+    {
+      if(i != j && i + 1 != j)
+      {
+        Test::EXPECT_NEAR_KK(B(i, j), KAT::zero(), Test::svdEpsilon<Scalar>());
+      }
+    }
+  }
+  //Check that row's diagonal is now zero
+  Test::EXPECT_NEAR_KK(B(row, row), KAT::zero(), Test::svdEpsilon<Scalar>());
+  //Check that the product is still maintained
+  Matrix UB2("UB", m, n);
+  Test::vanillaGEMM(1.0, U, B, 0.0, UB2);
+  for(int i = 0; i < m; i++)
+  {
+    for(int j = 0; j < n; j++)
+    {
+      Test::EXPECT_NEAR_KK(UB(i, j), UB2(i, j), Test::svdEpsilon<Scalar>());
+    }
+  }
+  //Check that U is still orthogonal
+  verifyOrthogonal(U);
+}
+
+template <typename Scalar, typename Layout, typename Device>
+void testSVD()
+{
+  testSerialSVD<Scalar, Layout, Device>(0, 0, 0);
+  testSerialSVD<Scalar, Layout, Device>(1, 0, 0);
+  testSerialSVD<Scalar, Layout, Device>(0, 1, 0);
+  testSerialSVD<Scalar, Layout, Device>(2, 2, 0);
+  testSerialSVD<Scalar, Layout, Device>(2, 2, 1);
+  testSerialSVD<Scalar, Layout, Device>(10, 8, 0);
+  testSerialSVD<Scalar, Layout, Device>(8, 10, 0);
+  testSerialSVD<Scalar, Layout, Device>(10, 1, 0);
+  testSerialSVD<Scalar, Layout, Device>(1, 10, 0);
+  testSerialSVD<Scalar, Layout, Device>(10, 8, 3);
+  testSerialSVD<Scalar, Layout, Device>(8, 10, 4);
+  //Test with all-zero matrix
+  testSerialSVD<Scalar, Layout, Device>(8, 10, 0, 0.0);
+  //Test some important internal routines which are not called often
+  testSerialSVDZeroLastRow<Scalar, Layout, Device>(10);
+  testSerialSVDZeroDiagonal<Scalar, Layout, Device>(10, 3);
+  //Test the mode that just computes singular values
+  testSerialSVDSingularValuesOnly<Scalar, Layout, Device>(10, 8);
+}
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F( TestCategory, batched_scalar_serial_svd_double ) {
+  //Test general SVD on a few different input sizes (full rank randomized)
+  testSVD<double, Kokkos::LayoutLeft, TestExecSpace>();
+  testSVD<double, Kokkos::LayoutRight, TestExecSpace>();
+}
+#endif
+
+#if defined(KOKKOSKERNELS_INST_FLOAT)
+TEST_F( TestCategory, batched_scalar_serial_svd_float ) {
+  //Test general SVD on a few different input sizes (full rank randomized)
+  testSVD<float, Kokkos::LayoutLeft, TestExecSpace>();
+  testSVD<float, Kokkos::LayoutRight, TestExecSpace>();
+}
+#endif