[Example] add N-body of O(N log N) make use of sparse #814

archibate · 2020-04-18T03:43:04Z

Concisely describe the proposed feature

Describe the solution you'd like (if any)

Additional comments

archibate · 2020-04-18T05:04:40Z

https://github.com/EnthalpyBill/Tree-Gravity-Python
https://zhuanlan.zhihu.com/p/90093326

archibate · 2020-04-18T06:25:30Z

#include <GL/glut.h>
#include <glm/glm.hpp>
#include <glm/gtc/type_ptr.hpp>
#include <vector>
#include <array>
#include <cstdio>
#include <cmath>

const int RES = 512;
const int IVAL = 1000.0 / 300;
const float SIZ = 1.2;
const float DT = 0.001;
const float SPED = 1.0;
const float GRAV = 1.0;

inline glm::vec2 gravity(glm::vec2 dis, float mass)
{
    auto d = glm::length(dis);
    if (d < 1e-5) return glm::vec2(0.0);
    auto sc = mass / (d * d * d);
    return dis * sc;
}

struct Particle {
    glm::vec2 pos, vel;
    float mass;
    bool background;

    Particle(glm::vec2 &&pos, glm::vec2 &&vel,
            float mass = 0.0, bool background = false)
        : pos(pos), vel(vel * SPED * DT)
        , mass(mass * GRAV * DT * DT)
        , background(background)
    {}

    void render(void) const
    {
        glVertex3fv(glm::value_ptr(pos));
    }
};

std::vector<Particle> P;

int bv2i(glm::bvec2 const &b)
{
    return int(b.x) + int(b.y) * 2;
}

glm::vec2 i2bvf(int i)
{
    return glm::vec2(i % 2, i / 2);
}

struct Quadtree {

    glm::vec2 topleft;
    float size;
    int par{-1};
    std::array<Quadtree *, 4> ch;

    constexpr glm::vec2 fullsize() const
    {
        return glm::vec2(size);
    }

    constexpr glm::vec2 halfsize() const
    {
        return glm::vec2(size * 0.5);
    }

    float mass() const
    {
        float m = 0;
        for (auto &&c: ch) {
            if (c) m += c->mass();
        }
        if (par != -1) m += P[par].mass;
        return m;
    }

    glm::vec2 mass_center() const
    {
        glm::vec2 p(0.0);
        float m = 0;
        for (auto &&c: ch) {
            if (!c) continue;
            auto ma = c->mass();
            p += ma * c->mass_center();
            m += ma;
        }
        if (par != -1) {
            m += P[par].mass;
            p += P[par].pos * P[par].mass;
        }
        return m == 0.0 ? this->center() : p / m;
    }

    glm::vec2 center() const
    {
        return topleft + halfsize();
    }

    float factor(glm::vec2 const &pos) const
    {
        auto d = this->center() - pos;
        return size / glm::length(d);
    }

    glm::vec2 visit(glm::vec2 const &pos, int depth = 0) const
    {
        float fac = 0;
        glm::vec2 acc(0.0);
        for (auto &&c: ch) {
            if (!c) continue;
            auto f = c->factor(pos);
            if (f >= 1)
                acc += c->visit(pos);
            else
                acc += gravity(c->mass_center() - pos, c->mass());
        }
        if (par != -1) {
            acc += gravity(P[par].pos - pos, P[par].mass);
        }
        return acc;
    }

    bool in_range(glm::vec2 const &pos) const
    {
        return glm::all(glm::greaterThanEqual(pos, topleft))
            && glm::all(glm::lessThan(pos, topleft + fullsize()));
    }

    glm::bvec2 which_range(glm::vec2 const &pos) const
    {
        return glm::greaterThanEqual(pos, center());
    }

    void build(std::vector<int> const &ps, int depth = 0)
    {
        if (ps.size() == 0) return;
        if (ps.size() == 1) {
            par = ps[0];
            return;
        }

        std::vector<int> npss[4];
        for (auto &&p: ps) {
            if (!in_range(P[p].pos)) continue;
            int idx = bv2i(which_range(P[p].pos));
            npss[idx].push_back(p);
        }
        for (int i = 0; i < 4; i++) {
            if (npss[i].size()) {
                auto b = i2bvf(i);
                ch[i] = new Quadtree{topleft + halfsize() * b, size * 0.5f};
                ch[i]->build(npss[i], depth + 1);
            }
        }
    }

    void render(int depth = 0)
    {
        glLineWidth(par != -1 ? 2 : 1);
        glColor4f(par != -1 ? 0.9 : 0.3, 0.0, 0.0, 0.1);
        glBegin(GL_LINE_LOOP);
        glVertex2f(topleft.x, topleft.y);
        glVertex2f(topleft.x + size, topleft.y);
        glVertex2f(topleft.x + size, topleft.y + size);
        glVertex2f(topleft.x, topleft.y + size);
        glEnd();
        for (auto &&c: ch) {
            if (c) c->render(depth + 1);
        }
    }
};

Quadtree *qt;

void build_tree(void)
{
    qt = new Quadtree{glm::vec2(0.0), SIZ};
    std::vector<int> ps;
    for (int i = 0; i < P.size(); i++) {
        ps.push_back(i);
    }
    qt->build(ps);
}

glm::vec2 field(glm::vec2 const &pos)
{
    return qt->visit(pos);  /* comment this line for O(N*N) computation */
    glm::vec2 acc(0.0);
    for (auto &&p: P) {
        acc += gravity(p.pos - pos, p.mass);
    }
    return acc;
}

void advance(void)
{
    build_tree();
    float dt = (rand() / RAND_MAX) * 0.7 + 0.4;
    for (auto &&p: P) {
        if (p.background)
            continue;
        auto acc = field(p.pos);
        p.vel += acc * dt;
    }
    for (auto &&p: P) {
        p.pos += p.vel * dt;
    }
}

void init(void)
{
#if 1
    P.push_back(Particle(
            glm::vec2(0.47, 0.50),
            glm::vec2(0.00, 0.40),
            0.050));
    P.push_back(Particle(
            glm::vec2(0.53, 0.50),
            glm::vec2(0.00, -0.40),
            0.050));
    P.push_back(Particle(
            glm::vec2(1.10, 0.71),
            glm::vec2(0.00, 0.27),
            0.002));
#else
    P.push_back(Particle(
            glm::vec2(0.50, 0.44),
            glm::vec2(0.00, 0.00),
            0.050));
    P.push_back(Particle(
            glm::vec2(0.50, 0.56),
            glm::vec2(0.10, 0.00),
            0.050));
#endif

    float mass = 0.0;
    glm::vec2 adjust(0.0);
    for (auto &&p: P) {
        mass += p.mass;
        adjust += p.vel * p.mass;
    }
    adjust /= mass;
    for (auto &&p: P) {
        if (p.mass != 0.0)
            p.vel -= adjust;
    }
}

void display(void)
{
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glColor4f(0.0, 0.0, 0.0, 0.05);
    glRectf(0.0, 0.0, SIZ, SIZ);
    glColor4f(1.0, 1.0, 1.0, 1.0);
    glPointSize(3);
    glBegin(GL_POINTS);
    for (auto &&p: P) {
        p.render();
    }
    glEnd();
    if (qt) qt->render();
    glFlush();
}

void timer(int id)
{
    advance();
    advance();
    advance();
    advance();
    advance();
    glutPostRedisplay();
    glutTimerFunc(IVAL, timer, 1);
}

void keyboard(unsigned char key, int x, int y)
{
    if (key == 27)
        exit(0);
}

int main(int argc, char **argv)
{
    glutInit(&argc, argv);

    glutInitDisplayMode(GLUT_SINGLE | GLUT_RGB);
    glutInitWindowSize(RES, RES);
    glutCreateWindow("N-body");

    glutKeyboardFunc(keyboard);
    glutDisplayFunc(display);
    glutTimerFunc(IVAL, timer, 1);

    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    gluOrtho2D(0.0, SIZ, 0.0, SIZ);
    glShadeModel(GL_FLAT);

    init();

    glutMainLoop();
}

archibate · 2020-04-18T14:32:03Z

@yuanming-hu could you write a doc on sparse data structures? So that I know how to make use of it to make octrees?

yuanming-hu · 2020-04-18T14:37:18Z

Right, I should do that once I get a chance...

archibate · 2020-04-18T15:07:44Z

C++ version is done now, have fun! You may notice that the precession is gone after we uncomment the line return qt->visit(pos);, this means we did ignored the binary star's size now, and thus we are on O(n*log n) now!
To make precession back again and thus make the result precise as O(n*n) does, we may extend Quadtree::mass to Quadtree::quadpole, finally when move it to taichi, we should obtain a python version of https://github.com/mockingbirdnest/Principia and hopefully more better performance :)
Anyway, no rush at all, this is after v0.6.0 anyway... Hopefully I will write an article on this n-body example to advertise taichi...definitely we will attract a lot physics people!

archibate · 2020-04-18T16:39:11Z

I found a bug when writing python n-body on ti.x64:

@ti.kernel
def adjust_momentum():
    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in pos:
        mac += mass[i]
        adj += vel[i] * mac
    adj = adj / mac
    for i in pos:
        vel[i] += adj

obtains:

[I 04/19/20 00:42:15.293] [compile_to_offloads.cpp:operator()@17] Constant folded:
kernel {
  <f32 x1> $0 = alloca
  <f32 x1> $1 = const [0.0]
  <f32 x1> $2 : local store [$0 <- $1]
  <f32 x1> $3 = alloca
  <f32 x1> $4 : local store [$3 <- $1]
  <f32 x1> $5 = alloca
  <f32 x1> $6 : local store [$5 <- $1]
  <i32 x1> $7 = alloca
  <i32 x1> $8 = const [0]
  <i32 x1> $9 = const [4]
  $10 : for $7 in range($8, $9, step 1) {
    <i32 x1> $11 = local load [ [$7[0]]]
    <i32 x1> $12 = const [-1]
    <i32 x1> $13 = bit_extract($11 + 0, 0~2)
    <i32 x1> $14 = const [3]
    <i32 x1> $15 = cmp_lt $13 $14
    <i32 x1> $16 = bit_and $12 $15
    $17 : if $16 {
      <f32*x1> $18 = global ptr [S8place_f32], index [$13] activate=false
      <gen*x1> $19 = get root
      <i32 x1> $20 = const [0]
      <gen*x1> $21 = [S0root][root]::lookup($19, $20) activate = false
      <gen*x1> $22 = get child [S0root->S7dense] $21
      <i32 x1> $23 = bit_extract($13 + 0, 0~2)
      <i32 x1> $24 = const [1]
      <i32 x1> $25 = mul $23 $24
      <i32 x1> $26 = add $20 $25
      <gen*x1> $27 = [S7dense][dense]::lookup($22, $26) activate = false
      <f32*x1> $28 = get child [S7dense->S8place_f32] $27
      <f32 x1> $29 = global load $28
      <f32 x1> $30 = atomic add($5, $29)
      <f32*x1> $31 = global ptr [S5place_f32], index [$13] activate=false
      <gen*x1> $32 = get child [S0root->S4dense] $21
      <gen*x1> $33 = [S4dense][dense]::lookup($32, $26) activate = false
      <f32*x1> $34 = get child [S4dense->S5place_f32] $33
      <f32 x1> $35 = global load $34
      <f32 x1> $36 = local load [ [$5[0]]]
      <f32 x1> $37 = mul $35 $36
      <f32*x1> $38 = global ptr [S6place_f32], index [$13] activate=false
      <f32*x1> $39 = get child [S4dense->S6place_f32] $33
      <f32 x1> $40 = global load $39
      <f32 x1> $41 = mul $40 $36
      <f32 x1> $42 = atomic add($0, $37)
      <f32 x1> $43 = atomic add($3, $41)
    }
  }
  <f32 x1> $44 = local load [ [$0[0]]]
  <f32 x1> $45 = local load [ [$5[0]]]
  <f32 x1> $46 = div $44 $45
  <f32 x1> $47 = local load [ [$3[0]]]
  <f32 x1> $48 = div $47 $45
  <i32 x1> $49 = alloca
  $50 : for $49 in range($8, $9, step 1) {
    <i32 x1> $51 = local load [ [$49[0]]]
    <i32 x1> $52 = const [-1]
    <i32 x1> $53 = bit_extract($51 + 0, 0~2)
    <i32 x1> $54 = const [3]
    <i32 x1> $55 = cmp_lt $53 $54
    <i32 x1> $56 = bit_and $52 $55
    $57 : if $56 {
      <f32*x1> $58 = global ptr [S5place_f32], index [$53] activate=false
      <gen*x1> $59 = get root
      <i32 x1> $60 = const [0]
      <gen*x1> $61 = [S0root][root]::lookup($59, $60) activate = false
      <gen*x1> $62 = get child [S0root->S4dense] $61
      <i32 x1> $63 = bit_extract($53 + 0, 0~2)
      <i32 x1> $64 = const [1]
      <i32 x1> $65 = mul $63 $64
      <i32 x1> $66 = add $60 $65
      <gen*x1> $67 = [S4dense][dense]::lookup($62, $66) activate = false
      <f32*x1> $68 = get child [S4dense->S5place_f32] $67
      <f32 x1> $69 = atomic add($68, $46)
      <f32*x1> $70 = global ptr [S6place_f32], index [$53] activate=false
      <f32*x1> $71 = get child [S4dense->S6place_f32] $67
      <f32 x1> $72 = atomic add($71, $48)
    }
  }
}
[I 04/19/20 00:42:15.294] [compile_to_offloads.cpp:operator()@17] Offloaded:
kernel {
  $0 = offloaded  {
    <f32*x1> $1 = global tmp var (offset = 4 B)
    <f32 x1> $2 = const [0.0]
    <f32*x1> $3 : global store [$1 <- $2]
    <f32 x1> $4 = const [0.0]
    <f32*x1> $5 = global tmp var (offset = 4 B)
    <f32 x1> $6 = const [0.0]
    <f32*x1> $7 : global store [$5 <- $6]
    <f32*x1> $8 = global tmp var (offset = 8 B)
    <f32 x1> $9 = const [0.0]
    <f32*x1> $10 : global store [$8 <- $9]
    <f32*x1> $11 = global tmp var (offset = 8 B)
    <f32 x1> $12 = const [0.0]
    <f32*x1> $13 : global store [$11 <- $12]
    <f32*x1> $14 = global tmp var (offset = 0 B)
    <f32 x1> $15 = const [0.0]
    <f32*x1> $16 : global store [$14 <- $15]
    <f32*x1> $17 = global tmp var (offset = 0 B)
    <f32 x1> $18 = const [0.0]
    <f32*x1> $19 : global store [$17 <- $18]
    <i32 x1> $20 = alloca
    <i32 x1> $21 = const [0]
    <i32 x1> $22 = const [4]
  }
  $23 = offloaded range_for(0, 4) block_dim=adaptive {
    <i32 x1> $24 = loop index 0
    <i32 x1> $25 = const [-1]
    <i32 x1> $26 = bit_extract($24 + 0, 0~2)
    <i32 x1> $27 = const [3]
    <i32 x1> $28 = cmp_lt $26 $27
    <i32 x1> $29 = bit_and $25 $28
    $30 : if $29 {
      <f32*x1> $31 = global ptr [S8place_f32], index [$26] activate=false
      <gen*x1> $32 = get root
      <i32 x1> $33 = const [0]
      <gen*x1> $34 = [S0root][root]::lookup($32, $33) activate = false
      <gen*x1> $35 = get child [S0root->S7dense] $34
      <i32 x1> $36 = bit_extract($26 + 0, 0~2)
      <i32 x1> $37 = const [1]
      <i32 x1> $38 = mul $36 $37
      <i32 x1> $39 = add $33 $38
      <gen*x1> $40 = [S7dense][dense]::lookup($35, $39) activate = false
      <f32*x1> $41 = get child [S7dense->S8place_f32] $40
      <f32 x1> $42 = global load $41
      <f32*x1> $43 = global tmp var (offset = 0 B)
      <f32 x1> $44 = atomic add($43, $42)
      <f32*x1> $45 = global ptr [S5place_f32], index [$26] activate=false
      <gen*x1> $46 = get child [S0root->S4dense] $34
      <gen*x1> $47 = [S4dense][dense]::lookup($46, $39) activate = false
      <f32*x1> $48 = get child [S4dense->S5place_f32] $47
      <f32 x1> $49 = global load $48
      <f32*x1> $50 = global tmp var (offset = 0 B)
      <f32 x1> $51 = global load $50
      <f32 x1> $52 = mul $49 $51
      <f32*x1> $53 = global ptr [S6place_f32], index [$26] activate=false
      <f32*x1> $54 = get child [S4dense->S6place_f32] $47
      <f32 x1> $55 = global load $54
      <f32 x1> $56 = mul $55 $51
      <f32*x1> $57 = global tmp var (offset = 4 B)
      <f32 x1> $58 = atomic add($57, $52)
      <f32*x1> $59 = global tmp var (offset = 8 B)
      <f32 x1> $60 = atomic add($59, $56)
    }
  }
  $61 = offloaded  {
    <f32*x1> $62 = global tmp var (offset = 4 B)
    <f32 x1> $63 = global load $62
    <f32*x1> $64 = global tmp var (offset = 0 B)
    <f32 x1> $65 = global load $64
    <f32 x1> $66 = div $63 $65
    <f32*x1> $67 = global tmp var (offset = 8 B)
    <f32 x1> $68 = global load $67
    <f32 x1> $69 = div $68 $65
    <i32 x1> $70 = alloca
  }
  $71 = offloaded range_for(0, 4) block_dim=adaptive {
    <i32 x1> $72 = loop index 0
    <i32 x1> $73 = const [-1]
    <i32 x1> $74 = bit_extract($72 + 0, 0~2)
    <i32 x1> $75 = const [3]
    <i32 x1> $76 = cmp_lt $74 $75
    <i32 x1> $77 = bit_and $73 $76
    $78 : if $77 {
      <f32*x1> $79 = global ptr [S5place_f32], index [$74] activate=false
      <gen*x1> $80 = get root
      <i32 x1> $81 = const [0]
      <gen*x1> $82 = [S0root][root]::lookup($80, $81) activate = false
      <gen*x1> $83 = get child [S0root->S4dense] $82
      <i32 x1> $84 = bit_extract($74 + 0, 0~2)
      <i32 x1> $85 = const [1]
      <i32 x1> $86 = mul $84 $85
      <i32 x1> $87 = add $81 $86
      <gen*x1> $88 = [S4dense][dense]::lookup($83, $87) activate = false
      <f32*x1> $89 = get child [S4dense->S5place_f32] $88
      <f32 x1> $90 = atomic add($89, $66)
      <f32*x1> $91 = global ptr [S6place_f32], index [$74] activate=false
      <f32*x1> $92 = get child [S4dense->S6place_f32] $88
      <f32 x1> $93 = atomic add($92, $69)
    }
  }
}
[E 04/19/20 00:38:07.480] [verify.cpp:basic_verify@37] stmt 90 cannot have operand 66.

It cannot because $66 exist in the previous offload, and the constant_fold optimized it even if it's crossing offloaded tasks.
Maybe we can move Constant Fold after Offloaded? @xumingkuan @yuanming-hu

In additional, if I change it to vel[i] -= adj, no error, but giving bad adjust result. (momentum not cleared, and became bigger than before)
I'll try to write a minimal example to reproduce this bug.

yuanming-hu · 2020-04-18T16:47:06Z

Right, this is indeed a bug, being fixed in #813...

yuanming-hu · 2020-04-18T16:50:26Z

I merged #813 in. Could you retry with the latest commit?

archibate · 2020-04-18T16:53:51Z

Great! Building now, but can you tell why -= doesn't have that error while += has? Did we treat them differently somewhere?

archibate · 2020-04-18T16:57:19Z

After merging with latest commit, the issue still exists...

archibate · 2020-04-18T17:03:51Z

I realized this is not about constant_fold.

<f32 x1> $67 = local load [ [$0[0]]]
  <f32 x1> $68 = local load [ [$5[0]]]
  <f32 x1> $69 = div $67 $68
  <f32 x1> $70 : local store [$0 <- $69]
  <f32 x1> $71 = local load [ [$3[0]]]
  <f32 x1> $72 = div $71 $68
  <f32 x1> $73 : local store [$3 <- $72]
  <i32 x1> $74 = alloca

after Simplified II:

<f32 x1> $44 = local load [ [$0[0]]]
  <f32 x1> $45 = local load [ [$5[0]]]
  <f32 x1> $46 = div $44 $45
  <f32 x1> $47 = local load [ [$3[0]]]
  <f32 x1> $48 = div $47 $45
  <i32 x1> $49 = alloca

The local store was eliminated! Why? Can we do full_simplify after offloaded? You don't want to bother offload bounderies.

archibate · 2020-04-18T17:13:19Z

I removed full_simplify and compilation passed.
Now things gets more interesting:

@ti.kernel
def adjust_momentum():
    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in vel:
        mac += mass[i]
        adj += vel[i] * mac
    adj /= mac
    print(adj[0])
    print(adj[1])
    for i in pos:
        print(adj[0])
        print(adj[1])
        vel[i] += adj

obtains:

[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000466
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000466
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000466
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000466

If I change vel[i] += adj to vel[i] -= adj:

[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000074
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000074
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000074
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000074

Schrodinger's cat?? Why changing codes after a line can affect that line???

archibate · 2020-04-18T17:24:04Z

SIZ = 1.2
DT = 0.001

ti.init(arch=ti.x64)

N = 3
pos = ti.Vector(2, dt=ti.f32, shape=N)
vel = ti.Vector(2, dt=ti.f32, shape=N)
mass = ti.var(dt=ti.f32, shape=N)

@ti.func
def gravity(dis, m):
    d = dis.norm()
    ret = ti.Vector([0.0, 0.0])
    if d >= 1e-5:
        sc = m / d ** 3
        ret = dis * sc
    return ret

@ti.func
def field(p):
    acc = ti.Vector([0.0, 0.0])
    for i in range(N):
        acc += gravity(pos[i] - p, mass[i])
    return acc

@ti.kernel
def advance():
    dt = ti.random() * 0.7 + 0.4
    for i in pos:
        acc = field(pos[i])
        vel[i] += acc * dt
    for i in pos:
        pos[i] += vel[i] * dt


@ti.kernel
def adjust_momentum():
    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in vel:
        mac += mass[i]
        adj += vel[i] * mac
    adj /= mac
    print(adj[0])
    print(adj[1])
    for i in pos:
        print(adj[0])
        print(adj[1])
        vel[i] += adj

def initialize():
    pos.from_numpy(np.array([0.47, 0.50, 0.53, 0.50, 1.10, 0.71]).reshape(N, 2))
    vel.from_numpy(np.array([0.00, 0.40, 0.00, -0.4, 0.00, 0.27]).reshape(N, 2) * DT)
    mass.from_numpy(np.array([0.050, 0.050, 0.000]) * DT ** 2)


initialize()
adjust_momentum()

gui = ti.GUI('N-body')
while not gui.get_event(ti.GUI.PRESS):
    for i in range(5):
        _pos = pos.to_numpy()
        gui.circles(_pos / SIZ, radius=5, color=0x333333 * i)
        for i in range(5):
            advance()
    gui.show()

Have ~~bug~~ fun!

yuanming-hu · 2020-04-18T20:40:42Z

Thanks for sharing this. There's actually a racing condition (which is undefined behavior in Taichi) in this piece of code and I'm not surprised that you are getting weird behaviors.

    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in vel:
        mac += mass[i]
        adj += vel[i] * mac

Note that mac is read and written in parallel.

I believe you meant

    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in vel: # Note this loop is parallel
        mac += mass[i]
        adj += vel[i] * mass[i] # instead of mac

archibate · 2020-04-19T05:42:41Z

Another bug:

@ti.kernel
def advance():
    for i in range(N):
        n = ti.cast(ti.max(vel[i].norm() / VEL0 + 0.5, 1), ti.i32)
        cnt[i] = n
    for i in range(N):
        dt = 1 / cnt[i]
        for t in range(cnt[i]):
            acc[i] = field(pos[i])
            print(acc[i][0] * INV_DT ** 2)
            for s in range(1):
                vel[i] += acc[i] * dt
                pos[i] += vel[i] * dt

obtains:

[debug] acc[i][0] * INV_DT ** 2 = 0.000000
[debug] acc[i][0] * INV_DT ** 2 = 0.000000
[debug] acc[i][0] * INV_DT ** 2 = 0.000000

If I replace for s in range(1): by if 1:, it becomes normal:

[debug] acc[i][0] * INV_DT ** 2 = 0.000000
[debug] acc[i][0] * INV_DT ** 2 = -0.211246
[debug] acc[i][0] * INV_DT ** 2 = -0.211451
[debug] acc[i][0] * INV_DT ** 2 = 0.000000
[debug] acc[i][0] * INV_DT ** 2 = -0.211657
[debug] acc[i][0] * INV_DT ** 2 = -0.211862

archibate · 2020-04-19T06:12:19Z

@yuanming-hu I suddenly realized that the differentiable programming may help us to calculate gravity force vector from potential energy!!!

archibate · 2020-04-19T10:16:46Z

import taichi as ti
import numpy as np
from random import Random

rand_gen = Random()

SIZ = 1.2
DT = 0.0005
INV_DT = 1 / DT
INV_VEL0 = INV_DT / 0.2
INV_ACC0 = INV_DT ** 2 / 0.2
MAX_LOSS = 2 ** 14
MIN_DIS3 = 4e-7

ti.init(arch=ti.cuda)#, print_ir=True)

N = 32
T = 9
pos = ti.Vector(2, dt=ti.f32)
acc = ti.Vector(2, dt=ti.f32)
vel = ti.Vector(2, dt=ti.f32)
mass = ti.var(dt=ti.f32)
pot = ti.var(dt=ti.i32)
max_pot = ti.var(dt=ti.i32)

d_x = ti.root.dense(ti.i, N)
b_x = ti.root.bitmasked(ti.i, N)
d_x.place(pos, vel, pot, mass)
b_x.place(acc)
ti.root.place(max_pot)

@ti.func
def gravity(dis, m):
    d = dis.norm()
    ret = ti.Vector([0.0, 0.0])
    ret = dis * (m / (MIN_DIS3 + d ** 3))
    return ret

@ti.func
def field(p):
    a = ti.Vector([0.0, 0.0])
    for i in range(N):
        a += gravity(pos[i] - p, mass[i])
    return a

@ti.kernel
def calc_pot():
    max_pot[None] = 0
    for i in range(N):
        loss = acc[i].norm() * INV_ACC0
        loss += vel[i].norm() * INV_VEL0
        n = ti.min(ti.max(loss, 1), MAX_LOSS)
        pot[i] = ti.cast(ti.log(n) / ti.log(2.0) + 0.5, ti.i32)
        ti.atomic_max(max_pot[None], pot[i])

@ti.kernel
def advance_sub(seq: ti.i32):#, dt0: ti.f32):
    for i in range(N):
        p = max_pot[None] - pot[i]
        tp = 2 ** p
        if seq % tp:
            ti.deactivate(b_x, i)
            continue
        acc[i] = field(pos[i])

    for i in acc:
        steps = 2 ** pot[i]
        dt = 1 / steps
        vel[i] += acc[i] * dt
        pos[i] += vel[i] * dt

@ti.kernel
def bouncing_wall():
    for i in range(N):
        for s in ti.static(range(2)):
            if pos[i][s] >= SIZ and vel[i][s] > 0:
                vel[i][s] = -vel[i][s]
            elif pos[i][s] <= 0 and vel[i][s] < 0:
                vel[i][s] = -vel[i][s]

def advance():
        calc_pot()
        max_seq = 2 ** max_pot[None]
        for seq in range(max_seq):
            advance_sub(seq)#, rand_gen.gauss(1, 0.1))
        bouncing_wall()

@ti.kernel
def adjust_momentum():
    mac = 0.0
    adj = ti.Vector([0.0, 0.0])
    for i in range(N):
        mac += mass[i]
        adj += vel[i] * mass[i]
    adj /= mac
    for i in range(N):
        vel[i] -= adj

@ti.kernel
def calc_energy():
    e = 0.0
    for i in range(N):
        e += mass[i] * vel[i].norm_sqr()
    for i in range(N):
        phi = 0.0
        for j in range(N):
            if j == i: continue
            d = pos[i] - pos[j]
            phi += mass[j] / d.norm()
        e -= phi * mass[i]
    e *= INV_DT ** 4
    print(e)

@ti.kernel
def calc_momentum():
    p = ti.Vector([0.0, 0.0])
    for i in range(N):
        p += vel[i] * mass[i]
    p *= INV_DT ** 3
    print(p[0])
    print(p[1])

@ti.kernel
def initialize():
    for i in range(N):
        pos[i] = ti.Vector([ti.random(), ti.random()]) * SIZ
        vel[i] = (ti.Vector([ti.random(), ti.random()]) - 0.5) * 6.0 * DT
        mass[i] = 0.1 * ti.random() * DT ** 2


initialize()
#adjust_momentum()

gui = ti.GUI('N-body')
for frame in range(100000):
    if gui.get_event(ti.GUI.PRESS):
        break
    for i in range(4):
        advance()
    _pos = pos.to_numpy()
    gui.circles(_pos / SIZ, radius=2)
    gui.show()
    if frame % 32 == 0:
        calc_energy()

Update: dispatch more computation resource to those bodies in deeper gravity well so that energy is conserved. (i.e. dt is smaller for those particles whose acc are big)
Not sure if I'm using bitmasked correctly...

archibate · 2020-04-19T15:37:58Z

How to use pointer? Could you write a simple sparse quadtree example for me? It would be very helpful, thank you!
BUG again: triple embbed for-loop not working.
Gonna sleep now, and thinking if we want a seperate issue for this BUG.

KLozes · 2020-04-19T17:32:02Z

@archibate I think @yuanming-hu 's Taichi paper would be a good place to start reading about storing particles in sparse data structures. http://taichi.graphics/wp-content/uploads/2019/09/taichi_lang.pdf . You can look at the taichi_sparse.py example for the new syntax.

Also, you probably would not want to create a quadtree directly from taichi pointer data structures. Instead, you want to create a pyramid of sparse grids with increasing resolution to emulate an quadtree. An example of this is in the mgpcg.py example.

archibate added the feature request Suggest an idea on this project label Apr 18, 2020

archibate self-assigned this Apr 18, 2020

yuanming-hu mentioned this issue Apr 18, 2020

[ir] Fix compilation crash when offloading LocalStoreStmt and AtomicOpStmt #813

Merged

archibate mentioned this issue Apr 20, 2020

[Example] add quadtree.py #824

Merged

archibate mentioned this issue Jul 8, 2020

[Example] Add tree method n-body solver of O(n log n): tree_gravity.py #1440

Merged

archibate closed this as completed in #1440 Jul 12, 2020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Example] add N-body of O(N log N) make use of sparse #814

[Example] add N-body of O(N log N) make use of sparse #814

archibate commented Apr 18, 2020

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020

yuanming-hu commented Apr 18, 2020

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020 •

edited

Loading

yuanming-hu commented Apr 18, 2020

yuanming-hu commented Apr 18, 2020

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020

yuanming-hu commented Apr 18, 2020

archibate commented Apr 19, 2020

archibate commented Apr 19, 2020

archibate commented Apr 19, 2020 •

edited

Loading

archibate commented Apr 19, 2020 •

edited

Loading

KLozes commented Apr 19, 2020 •

edited

Loading

[Example] add N-body of O(N log N) make use of sparse #814

[Example] add N-body of O(N log N) make use of sparse #814

Comments

archibate commented Apr 18, 2020

archibate commented Apr 18, 2020 • edited Loading

archibate commented Apr 18, 2020 • edited Loading

archibate commented Apr 18, 2020

yuanming-hu commented Apr 18, 2020

archibate commented Apr 18, 2020 • edited Loading

archibate commented Apr 18, 2020 • edited Loading

yuanming-hu commented Apr 18, 2020

yuanming-hu commented Apr 18, 2020

archibate commented Apr 18, 2020 • edited Loading

archibate commented Apr 18, 2020

archibate commented Apr 18, 2020 • edited Loading

archibate commented Apr 18, 2020 • edited Loading

archibate commented Apr 18, 2020

yuanming-hu commented Apr 18, 2020

archibate commented Apr 19, 2020

archibate commented Apr 19, 2020

archibate commented Apr 19, 2020 • edited Loading

archibate commented Apr 19, 2020 • edited Loading

KLozes commented Apr 19, 2020 • edited Loading

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 18, 2020 •

edited

Loading

archibate commented Apr 19, 2020 •

edited

Loading

archibate commented Apr 19, 2020 •

edited

Loading

KLozes commented Apr 19, 2020 •

edited

Loading