From 4eea1ec19429bf457bf117a752ae4479cabfd7ac Mon Sep 17 00:00:00 2001
From: zhengxianli <zhengxianli@taichi.graphics>
Date: Thu, 13 Apr 2023 11:32:57 +0800
Subject: [PATCH] [perf] Fix Taichi CPU backend compile parameter to match
 Numba's performance. (#7731)

Issue: #7442

### Brief Summary

In this issue, Numba is an order of magnitude faster than Taichi because
the code generated by Taichi is not automatically vectorized.
The root cause is that the fast-math flags (`fast_flags`) were not being
passed correctly.

To fix this, the fast-math flags are now set on the IR builder during the
initialization of the CPU codegen whenever `fast_math` is enabled. Numba
and Taichi now show comparable performance.
Here is the performance comparison:
numba:            13052.542478 MFlops
taichi(master):    6544.274409 MFlops
taichi(this pr):  12778.240179 MFlops

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 taichi/codegen/llvm/codegen_llvm.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index a3133e4c4b2c2..468a501df927b 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -2542,6 +2542,13 @@ void TaskCodeGenLLVM::initialize_context() {
   TI_ASSERT(tlctx != nullptr);
   llvm_context = tlctx->get_this_thread_context();
   builder = std::make_unique<llvm::IRBuilder<>>(*llvm_context);
+  if (compile_config.fast_math) {
+    llvm::FastMathFlags fast_flags;
+    fast_flags.setNoInfs();
+    fast_flags.setNoSignedZeros();
+    fast_flags.setAllowReassoc();
+    builder->setFastMathFlags(fast_flags);
+  }
 }
 
 llvm::Value *TaskCodeGenLLVM::get_arg(int i) {