Skip to content

Commit

Permalink
Fix CI tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
frostedoyster committed Apr 16, 2024
1 parent e2edc9e commit ff1eab3
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 74 deletions.
3 changes: 3 additions & 0 deletions mops-torch/src/hpe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,5 +190,8 @@ std::vector<torch::Tensor> HomogeneousPolynomialEvaluationBackward::backward(
);
}

std::cout << grad_grad_output[0] << std::endl;
std::cout << grad_A_2[0] << std::endl;

return {grad_grad_output, grad_A_2, torch::Tensor(), torch::Tensor()};
}
102 changes: 53 additions & 49 deletions mops/src/hpe/cpu.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,25 +189,25 @@ void _homogeneous_polynomial_evaluation_vjp_templated_polynomial_order(
}
}

grad_o_ptr += size_first_dimension_interleft*simd_element_count; // shift grad_o to the remainder values
for (size_t i = 0; i < size_remainder; i++) {
scalar_t grad_output_i = grad_o_ptr[i];
size_t i_shift = i * n_possible_factors;
scalar_t* a_ptr_i = remainder_a_ptr + i_shift;
scalar_t* grad_a_ptr_i = remainder_grad_a_ptr + i_shift;
int32_t* indices_a_ptr_j = indices_a_ptr;
for (size_t j = 0; j < n_monomials; j++) {
scalar_t base_multiplier = grad_output_i*c_ptr[j];
for (uint8_t i_factor = 0; i_factor < polynomial_order; i_factor++) {
scalar_t temp = base_multiplier;
for (uint8_t j_factor = 0; j_factor < polynomial_order; j_factor++) {
if (j_factor == i_factor) continue;
temp *= a_ptr_i[indices_a_ptr_j[j_factor]];
}
grad_a_ptr_i[indices_a_ptr_j[i_factor]] += temp;
scalar_t* grad_o_ptr_i = grad_o_ptr + size_first_dimension_interleft*simd_element_count; // shift grad_o to the remainder values
scalar_t* a_ptr_i = remainder_a_ptr;
scalar_t* grad_a_ptr_i = remainder_grad_a_ptr;
int32_t* indices_a_ptr_j = indices_a_ptr;
for (size_t j = 0; j < n_monomials; j++) {
std::vector<scalar_t> base_multiplier(size_remainder);
for (size_t l = 0; l < size_remainder; l++) base_multiplier[l] = c_ptr[j] * grad_o_ptr_i[l];
for (uint8_t i_factor = 0; i_factor < polynomial_order; i_factor++) {
std::vector<scalar_t> temp(size_remainder);
for (size_t l = 0; l < size_remainder; l++) temp[l] = base_multiplier[l];
for (uint8_t j_factor = 0; j_factor < polynomial_order; j_factor++) {
if (j_factor == i_factor) continue;
scalar_t* a_ptr_i_j_factor = a_ptr_i + indices_a_ptr_j[j_factor] * size_remainder;
for (size_t l = 0; l < size_remainder; l++) temp[l] *= a_ptr_i_j_factor[l];
}
indices_a_ptr_j += polynomial_order;
scalar_t* grad_a_ptr_i_i_factor = grad_a_ptr_i + indices_a_ptr_j[i_factor] * size_remainder;
for (size_t l = 0; l < size_remainder; l++) grad_a_ptr_i_i_factor[l] += temp[l];
}
indices_a_ptr_j += polynomial_order;
}

un_interleave_tensor<scalar_t, simd_element_count>(grad_A, interleft_grad_a_ptr, remainder_grad_a_ptr);
Expand Down Expand Up @@ -288,7 +288,6 @@ void _homogeneous_polynomial_evaluation_vjp_vjp_templated_polynomial_order(

scalar_t* grad_grad_o_ptr = grad_grad_output.data;
scalar_t* grad_o_ptr = grad_output.data;
scalar_t* a_ptr = A.data;
scalar_t* c_ptr = C.data;
int32_t* indices_a_ptr = indices_A.data;

Expand Down Expand Up @@ -399,51 +398,56 @@ void _homogeneous_polynomial_evaluation_vjp_vjp_templated_polynomial_order(
}
}

grad_o_ptr_i = grad_o_ptr + size_first_dimension_interleft*simd_element_count;
a_ptr_i = remainder_a_ptr;
if (compute_grad_grad_output) {
grad_grad_o_ptr += size_first_dimension_interleft*simd_element_count;
grad_grad_o_ptr_i = grad_grad_o_ptr + size_first_dimension_interleft*simd_element_count;
}
grad_o_ptr += size_first_dimension_interleft*simd_element_count;
for (size_t i = 0; i < size_remainder; i++) {
scalar_t grad_output_i = grad_o_ptr[i];
size_t i_shift = i * n_possible_factors;
scalar_t* a_ptr_i = remainder_a_ptr + i_shift;
if (grad_grad_A_is_available) {
grad_grad_a_ptr_i = remainder_grad_grad_a_ptr + i_shift;
}
if (compute_grad_A_2) {
grad_a_2_ptr_i = remainder_grad_a_2_ptr + i_shift;
}
int32_t* indices_a_ptr_j = indices_a_ptr;
for (size_t j = 0; j < n_monomials; j++) {
grad_grad_a_ptr_i = remainder_grad_grad_a_ptr;
grad_a_2_ptr_i = remainder_grad_a_2_ptr;
int32_t* indices_a_ptr_j = indices_a_ptr;
for (size_t j = 0; j < n_monomials; j++) {
if (compute_grad_grad_output) {
scalar_t C_j = c_ptr[j];
if (compute_grad_grad_output) {
scalar_t base_multiplier = grad_output_i * C_j;
std::vector<scalar_t> base_multiplier(size_remainder);
for (size_t l = 0; l < size_remainder; l++) base_multiplier[l] = C_j;
for (uint8_t i_factor = 0; i_factor < polynomial_order; i_factor++) {
scalar_t temp = base_multiplier * grad_grad_a_ptr_i[indices_a_ptr_j[i_factor]];
scalar_t* grad_grad_a_ptr_i_i_factor = grad_grad_a_ptr_i + indices_a_ptr_j[i_factor] * size_remainder;
std::vector<scalar_t> temp(size_remainder);
for (size_t l = 0; l < size_remainder; l++) temp[l] = base_multiplier[l] * grad_grad_a_ptr_i_i_factor[l];
for (uint8_t j_factor = 0; j_factor < polynomial_order; j_factor++) {
if (j_factor == i_factor) continue;
temp *= a_ptr_i[indices_a_ptr_j[j_factor]];
scalar_t* a_ptr_i_j_factor = a_ptr_i + indices_a_ptr_j[j_factor] * size_remainder;
for (size_t l = 0; l < size_remainder; l++) temp[l] *= a_ptr_i_j_factor[l];
}
grad_grad_o_ptr[i] += temp;
for (size_t l = 0; l < size_remainder; l++) grad_grad_o_ptr_i[l] += temp[l];
}
}
if (compute_grad_A_2) {
for (uint8_t j_factor = 0; j_factor < polynomial_order; j_factor++) {
scalar_t base_multiplier = grad_output_i * C_j;
scalar_t temp = base_multiplier;
for (uint8_t i_factor = 0; i_factor < polynomial_order; i_factor++) {
if (i_factor == j_factor) continue;
scalar_t temp2 = temp * grad_grad_a_ptr_i[indices_a_ptr_j[i_factor]];
for (uint8_t k_factor = 0; k_factor < polynomial_order; k_factor++) {
if (k_factor == i_factor || k_factor == j_factor) continue;
temp2 *= a_ptr_i[indices_a_ptr_j[k_factor]];
}
grad_a_2_ptr_i[indices_a_ptr_j[j_factor]] += temp2;
}
if (compute_grad_A_2) {
scalar_t C_j = c_ptr[j];
std::vector<scalar_t> base_multiplier(size_remainder);
for (size_t l = 0; l < size_remainder; l++) base_multiplier[l] = C_j * grad_o_ptr_i[l];
for (uint8_t j_factor = 0; j_factor < polynomial_order; j_factor++) {
std::vector<scalar_t> temp(size_remainder);
for (size_t l = 0; l < size_remainder; l++) temp[l] = base_multiplier[l];
for (uint8_t i_factor = 0; i_factor < polynomial_order; i_factor++) {
if (i_factor == j_factor) continue;
scalar_t* grad_grad_a_ptr_i_i_factor = grad_grad_a_ptr_i + indices_a_ptr_j[i_factor] * size_remainder;
std::vector<scalar_t> temp2(size_remainder);
for (size_t l = 0; l < size_remainder; l++) temp2[l] = temp[l] * grad_grad_a_ptr_i_i_factor[l];
for (uint8_t k_factor = 0; k_factor < polynomial_order; k_factor++) {
if (k_factor == i_factor || k_factor == j_factor) continue;
scalar_t* a_ptr_i_k_factor = a_ptr_i + indices_a_ptr_j[k_factor] * size_remainder;
for (size_t l = 0; l < size_remainder; l++) temp2[l] *= a_ptr_i_k_factor[l];
}
scalar_t* grad_a_2_ptr_i_j_factor = grad_a_2_ptr_i + indices_a_ptr_j[j_factor] * size_remainder;
for (size_t l = 0; l < size_remainder; l++) grad_a_2_ptr_i_j_factor[l] += temp2[l];
}
}
indices_a_ptr_j += polynomial_order;
}
indices_a_ptr_j += polynomial_order;
}

if (compute_grad_A_2) {
Expand Down
26 changes: 13 additions & 13 deletions mops/src/opsaw/opsaw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,19 +92,19 @@ void mops::cuda::

template <typename scalar_t>
void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp(
Tensor<scalar_t, 3> grad_grad_output,
Tensor<scalar_t, 2> grad_A_2,
Tensor<scalar_t, 2> grad_B_2,
Tensor<scalar_t, 2> grad_W_2,
Tensor<scalar_t, 2> grad_grad_A,
Tensor<scalar_t, 2> grad_grad_B,
Tensor<scalar_t, 2> grad_grad_W,
Tensor<scalar_t, 3> grad_output,
Tensor<scalar_t, 2> A,
Tensor<scalar_t, 2> B,
Tensor<scalar_t, 2> W,
Tensor<int32_t, 1> indices_W,
Tensor<int32_t, 1> indices_output
Tensor<scalar_t, 3> /*grad_grad_output*/,
Tensor<scalar_t, 2> /*grad_A_2*/,
Tensor<scalar_t, 2> /*grad_B_2*/,
Tensor<scalar_t, 2> /*grad_W_2*/,
Tensor<scalar_t, 2> /*grad_grad_A*/,
Tensor<scalar_t, 2> /*grad_grad_B*/,
Tensor<scalar_t, 2> /*grad_grad_W*/,
Tensor<scalar_t, 3> /*grad_output*/,
Tensor<scalar_t, 2> /*A*/,
Tensor<scalar_t, 2> /*B*/,
Tensor<scalar_t, 2> /*W*/,
Tensor<int32_t, 1> /*indices_W*/,
Tensor<int32_t, 1> /*indices_output*/
) {
throw std::runtime_error("MOPS was not compiled with CUDA support");
}
Expand Down
24 changes: 12 additions & 12 deletions mops/src/sap/sap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,18 +90,18 @@ void mops::cuda::

template <typename scalar_t>
void mops::cuda::sparse_accumulation_of_products_vjp_vjp(
Tensor<scalar_t, 2> grad_grad_output,
Tensor<scalar_t, 2> grad_A_2,
Tensor<scalar_t, 2> grad_B_2,
Tensor<scalar_t, 2> grad_grad_A,
Tensor<scalar_t, 2> grad_grad_B,
Tensor<scalar_t, 2> grad_output,
Tensor<scalar_t, 2> A,
Tensor<scalar_t, 2> B,
Tensor<scalar_t, 1> C,
Tensor<int32_t, 1> indices_A,
Tensor<int32_t, 1> indices_B,
Tensor<int32_t, 1> indices_output
Tensor<scalar_t, 2> /*grad_grad_output*/,
Tensor<scalar_t, 2> /*grad_A_2*/,
Tensor<scalar_t, 2> /*grad_B_2*/,
Tensor<scalar_t, 2> /*grad_grad_A*/,
Tensor<scalar_t, 2> /*grad_grad_B*/,
Tensor<scalar_t, 2> /*grad_output*/,
Tensor<scalar_t, 2> /*A*/,
Tensor<scalar_t, 2> /*B*/,
Tensor<scalar_t, 1> /*C*/,
Tensor<int32_t, 1> /*indices_A*/,
Tensor<int32_t, 1> /*indices_B*/,
Tensor<int32_t, 1> /*indices_output*/
) {
throw std::runtime_error("MOPS was not compiled with CUDA support");
}
Expand Down

0 comments on commit ff1eab3

Please sign in to comment.