[T 11/17/21 18:54:36.325 12928] [opengl_api.cpp:taichi::lang::opengl::initialize_opengl@54] initialize_opengl(true) called [D 11/17/21 18:54:36.920 12928] [opengl_api.cpp:taichi::lang::opengl::initialize_opengl@97] OpenGL context loaded through GLFW [D 11/17/21 18:54:36.920 12928] [opengl_api.cpp:taichi::lang::opengl::initialize_opengl@187] OpenGL version 4.3 [T 11/17/21 18:54:36.920 12928] [taichi/inc/opengl_extension.inc.h:taichi::lang::opengl::initialize_opengl@3] [glsl] Found GL_ARB_compute_shader [T 11/17/21 18:54:36.920 12928] [taichi/inc/opengl_extension.inc.h:taichi::lang::opengl::initialize_opengl@4] [glsl] Found GL_ARB_gpu_shader_int64 [T 11/17/21 18:54:36.922 12928] [taichi/inc/opengl_extension.inc.h:taichi::lang::opengl::initialize_opengl@5] [glsl] Found GL_NV_shader_atomic_float [T 11/17/21 18:54:36.922 12928] [taichi/inc/opengl_extension.inc.h:taichi::lang::opengl::initialize_opengl@6] [glsl] Found GL_NV_shader_atomic_float64 [T 11/17/21 18:54:36.922 12928] [taichi/inc/opengl_extension.inc.h:taichi::lang::opengl::initialize_opengl@7] [glsl] Found GL_NV_shader_atomic_int64 [T 11/17/21 18:54:36.922 12928] [opengl_api.cpp:taichi::lang::opengl::initialize_opengl@206] GL_MAX_COMPUTE_WORK_GROUP_COUNT: 2147483647 [T 11/17/21 18:54:36.922 12928] [opengl_api.cpp:taichi::lang::opengl::initialize_opengl@209] GL_MAX_COMPUTE_WORK_GROUP_SIZE: 1536 [T 11/17/21 18:54:36.922 12928] [program.cpp:taichi::lang::Program::Program@46] Program initializing... [T 11/17/21 18:54:36.922 12928] [opengl_api.cpp:taichi::lang::opengl::initialize_opengl@54] initialize_opengl(true) called [T 11/17/21 18:54:36.922 12928] [memory_pool.cpp:taichi::lang::MemoryPool::MemoryPool@13] Memory pool created. Default buffer size per allocator = 1024 MB [T 11/17/21 18:54:36.922 12928] [program.cpp:taichi::lang::Program::Program@159] Program (0x233bdfe5d80) arch=opengl initialized. [T 11/17/21 18:54:36.923 12928] [__init__.py:init@548] Materializing runtime... [T 11/17/21 18:54:36.923 12928] [unified_allocator.cpp:taichi::lang::UnifiedAllocator::UnifiedAllocator@32] Allocating virtual address space of size 1024 MB [T 11/17/21 18:54:36.923 12928] [unified_allocator.cpp:taichi::lang::UnifiedAllocator::UnifiedAllocator@41] Memory allocated. Allocation time = 2.90e-05 s [T 11/17/21 18:54:36.923 12928] [taichi/system/unified_allocator.h:taichi::lang::UnifiedAllocator::allocate@39] UM [data=2421380022272] allocate() request=256 remain=1073741824 [T 11/17/21 18:54:36.923 12928] [opengl_api.cpp:taichi::lang::opengl::initialize_opengl@54] initialize_opengl(false) called [T 11/17/21 18:54:36.929 12928] [opengl_program.cpp:taichi::lang::OpenglProgramImpl::compile_snode_tree_types@42] OpenGL root buffer size: 8388608 B [T 11/17/21 18:54:36.933 12928] [kernel_impl.py:materialize@459] Compiling kernel initialize_c48_0... [T 11/17/21 18:54:36.942 12928] [constant_fold.cpp:taichi::lang::ConstantFold::get_jit_evaluator_kernel@68] Saving JIT evaluator cache entry id=18256548145024532736 [D 11/17/21 18:54:36.942 12928] [opengl_api.cpp:taichi::lang::opengl::CompiledProgram::add@257] [glsl] compiling kernel jit_evaluator_00<<<1, 1>>>: #version 430 core layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; precision highp float; layout(std430, binding = 0) buffer data_f32 { float _data_f32_[]; }; layout(std430, binding = 2) buffer args_f32 { float _args_f32_[]; }; const float inf = 1.0f / 0.0f; const float nan = 0.0f / 0.0f; void jit_evaluator_00() { // serial float B = _args_f32_[0 << 1]; float C = _args_f32_[1 << 1]; float D = B * C; _args_f32_[320 >> 2 + 0] = D; } void main() { jit_evaluator_00(); } [D 11/17/21 18:54:36.946 12928] [opengl_api.cpp:taichi::lang::opengl::CompiledProgram::add@257] [glsl] compiling kernel initialize_c48_00<<<8, 128>>>: #version 430 core layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; precision highp float; layout(std430, binding = 0) buffer data_f32 { float _data_f32_[]; }; const float inf = 1.0f / 0.0f; const float nan = 0.0f / 0.0f; void initialize_c48_00() { // range for // range known at compile time int _sid0 = int(gl_GlobalInvocationID.x); for (int _sid = _sid0; _sid < (1024); _sid += int(gl_WorkGroupSize.x * gl_NumWorkGroups.x)) { int _itv = 0 + _sid; int Bp = int(10); int Bf = int(1023); float B = float(0.62831855); int C = _itv; float D = float(C); float E = float(0.31415927); float F = D * E; float G = float(sin(F)); int H = int(0); int I = int(768); int Br = C & Bf; int Bs = Br << Bp; for (int J_ = H; J_ < I; J_ += 1) { int J = J_; int K = J; float L = float(K); float M = L * B; float N = float(sin(M)); float O = G * N; int AZ = 0; int B1 = AZ + 8388608 * H; // S0 int B2 = B1 + 0; // S1 int Bg = K & Bf; int Bo = Bg + Bs; int B6 = B2 + 4 * Bo; // S1 int B7 = B6 + 0; // S2 _data_f32_[B7 >> 2] = O; } } } void main() { initialize_c48_00(); } [Taichi] version 0.8.5, llvm 10.0.0, commit 45c6ad48, win, python 3.9.7 [Taichi] Starting on arch=opengl Wait...[T 11/17/21 18:54:36.947 12928] [kernel_impl.py:materialize@459] Compiling kernel compute_c50_0... [T 11/17/21 18:54:36.956 12928] [constant_fold.cpp:taichi::lang::ConstantFold::get_jit_evaluator_kernel@68] Saving JIT evaluator cache entry id=18256548145091904783 [D 11/17/21 18:54:36.956 12928] [opengl_api.cpp:taichi::lang::opengl::CompiledProgram::add@257] [glsl] compiling kernel jit_evaluator_10<<<1, 1>>>: #version 430 core layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; precision highp float; layout(std430, binding = 0) buffer data_i32 { int _data_i32_[]; }; layout(std430, binding = 2) buffer args_i32 { int _args_i32_[]; }; const float inf = 1.0f / 0.0f; const float nan = 0.0f / 0.0f; void jit_evaluator_10() { // serial int B = _args_i32_[0 << 1]; int C = _args_i32_[1 << 1]; int D = -int(B < C); _args_i32_[320 >> 2 + 0] = D; } void main() { jit_evaluator_10(); } [T 11/17/21 18:54:36.958 12928] [constant_fold.cpp:taichi::lang::ConstantFold::get_jit_evaluator_kernel@68] Saving JIT evaluator cache entry id=18256548145091904777 [D 11/17/21 18:54:36.958 12928] [opengl_api.cpp:taichi::lang::opengl::CompiledProgram::add@257] [glsl] compiling kernel jit_evaluator_20<<<1, 1>>>: #version 430 core layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; precision highp float; layout(std430, binding = 0) buffer data_i32 { int _data_i32_[]; }; layout(std430, binding = 2) buffer args_i32 { int _args_i32_[]; }; const float inf = 1.0f / 0.0f; const float nan = 0.0f / 0.0f; void jit_evaluator_20() { // serial int B = _args_i32_[0 << 1]; int C = _args_i32_[1 << 1]; int D = B & C; _args_i32_[320 >> 2 + 0] = D; } void main() { jit_evaluator_20(); } [D 11/17/21 18:54:36.961 12928] [opengl_api.cpp:taichi::lang::opengl::CompiledProgram::add@257] [glsl] compiling kernel compute_c50_00<<<1536, 128>>>: #version 430 core layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; precision highp float; layout(std430, binding = 0) buffer data_f32 { float _data_f32_[]; }; const float inf = 1.0f / 0.0f; const float nan = 0.0f / 0.0f; void compute_c50_00() { // range for // range known at compile time int _sid0 = int(gl_GlobalInvocationID.x); for (int _sid = _sid0; _sid < (1048576); _sid += int(gl_WorkGroupSize.x * gl_NumWorkGroups.x)) { int _itv = 0 + _sid; int EL = int(1023); int En = int(0); int U = int(1024); int T = int(10); int R = int(768); int D = _itv; int E4 = D >> T; int Ea = D & EL; int O = -int(Ea < R); if (O != 0) { float V = float(0); for (int W_ = En; W_ < U; W_ += 1) { int W = W_; int X = W; int Eb = X / U; int Ed = -int(X < En); int EW = Eb << T; int Eg = -int(Ed != En); int Eh = -int(X != En); int Ei = -int(EW != X); int Ej = Eg & Eh; int Ek = Ej & Ei; int El = Eb + Ek; int Z = El << T; int Aq = X - Z; int Ft = Aq & EL; int Fu = Ft << T; for (int Ar_ = En; Ar_ < R; Ar_ += 1) { int Ar = Ar_; int As = Ar; int Em = As / R; int Eo = -int(As < En); int Eq = Em * R; int Er = -int(Eo != En); int Es = -int(As != En); int Et = -int(Eq != As); int Eu = Er & Es; int Ev = Eu & Et; int Ew = Em + Ev; int Au = Ew * R; int Av = As - Au; int DH = 0; int DJ = DH + 8388608 * En; // S0 int DK = DJ + 0; // S1 int EE = Av & EL; int Fc = EE + Fu; int DO = DK + 4 * Fc; // S1 int DP = DO + 0; // S2 float Ax = _data_f32_[DP >> 2]; float Ay = V; float Az = Ay + Ax; V = Az; } } float AB = V; int DT = 0; int DV = DT + 8388608 * En; // S0 int DW = DV + 4194304; // S3 int Fe = E4 & EL; int Fs = Fe << T; int F4 = Ea + Fs; int E0 = DW + 4 * F4; // S3 int E1 = E0 + 0; // S4 _data_f32_[E1 >> 2] = AB; } } } void main() { compute_c50_00(); } [T 11/17/21 18:54:36.961 12928] [kernel_impl.py:materialize@459] Compiling kernel tensor_to_ext_arr_c4_0...