
[Example] add N-body of O(N log N) make use of sparse #814

Closed
archibate opened this issue Apr 18, 2020 · 19 comments · Fixed by #1440
Assignees: archibate
Labels: feature request

Comments

@archibate
Collaborator

Concisely describe the proposed feature

Describe the solution you'd like (if any)

Additional comments

archibate added the feature request label on Apr 18, 2020
archibate self-assigned this on Apr 18, 2020
@archibate
Collaborator Author

archibate commented Apr 18, 2020

#include <GL/glut.h>
#include <glm/glm.hpp>
#include <glm/gtc/type_ptr.hpp>
#include <vector>
#include <array>
#include <cstdio>
#include <cmath>
#include <cstdlib>   // for rand(), RAND_MAX and exit()

const int RES = 512;
const int IVAL = 1000.0 / 300;
const float SIZ = 1.2;
const float DT = 0.001;
const float SPED = 1.0;
const float GRAV = 1.0;

inline glm::vec2 gravity(glm::vec2 dis, float mass)
{
    auto d = glm::length(dis);
    if (d < 1e-5) return glm::vec2(0.0);
    auto sc = mass / (d * d * d);
    return dis * sc;
}

struct Particle {
    glm::vec2 pos, vel;
    float mass;
    bool background;

    Particle(glm::vec2 &&pos, glm::vec2 &&vel,
            float mass = 0.0, bool background = false)
        : pos(pos), vel(vel * SPED * DT)
        , mass(mass * GRAV * DT * DT)
        , background(background)
    {}

    void render(void) const
    {
        glVertex2fv(glm::value_ptr(pos));  // pos is a vec2, so use the 2-component variant
    }
};

std::vector<Particle> P;

// Map the per-axis comparison flags from which_range() (is the point past the
// node's center along x / along y?) to a flat child index 0..3, and back to a
// float offset for placing the child's corner.
int bv2i(glm::bvec2 const &b)
{
    return int(b.x) + int(b.y) * 2;
}

glm::vec2 i2bvf(int i)
{
    return glm::vec2(i % 2, i / 2);
}

struct Quadtree {

    glm::vec2 topleft;
    float size;
    int par{-1};
    std::array<Quadtree *, 4> ch{};  // children start out as null

    constexpr glm::vec2 fullsize() const
    {
        return glm::vec2(size);
    }

    constexpr glm::vec2 halfsize() const
    {
        return glm::vec2(size * 0.5);
    }

    float mass() const
    {
        float m = 0;
        for (auto &&c: ch) {
            if (c) m += c->mass();
        }
        if (par != -1) m += P[par].mass;
        return m;
    }

    glm::vec2 mass_center() const
    {
        glm::vec2 p(0.0);
        float m = 0;
        for (auto &&c: ch) {
            if (!c) continue;
            auto ma = c->mass();
            p += ma * c->mass_center();
            m += ma;
        }
        if (par != -1) {
            m += P[par].mass;
            p += P[par].pos * P[par].mass;
        }
        return m == 0.0 ? this->center() : p / m;
    }

    glm::vec2 center() const
    {
        return topleft + halfsize();
    }

    // Barnes-Hut opening criterion: the ratio of this node's size to the
    // distance from its center to the query point; visit() recurses into the
    // node when the ratio is >= 1 (an opening-angle threshold of theta = 1).
    float factor(glm::vec2 const &pos) const
    {
        auto d = this->center() - pos;
        return size / glm::length(d);
    }

    glm::vec2 visit(glm::vec2 const &pos, int depth = 0) const
    {
        glm::vec2 acc(0.0);
        for (auto &&c: ch) {
            if (!c) continue;
            auto f = c->factor(pos);
            if (f >= 1)
                acc += c->visit(pos);
            else
                acc += gravity(c->mass_center() - pos, c->mass());
        }
        if (par != -1) {
            acc += gravity(P[par].pos - pos, P[par].mass);
        }
        return acc;
    }

    bool in_range(glm::vec2 const &pos) const
    {
        return glm::all(glm::greaterThanEqual(pos, topleft))
            && glm::all(glm::lessThan(pos, topleft + fullsize()));
    }

    glm::bvec2 which_range(glm::vec2 const &pos) const
    {
        return glm::greaterThanEqual(pos, center());
    }

    void build(std::vector<int> const &ps, int depth = 0)
    {
        if (ps.size() == 0) return;
        if (ps.size() == 1) {
            par = ps[0];
            return;
        }

        std::vector<int> npss[4];
        for (auto &&p: ps) {
            if (!in_range(P[p].pos)) continue;
            int idx = bv2i(which_range(P[p].pos));
            npss[idx].push_back(p);
        }
        for (int i = 0; i < 4; i++) {
            if (npss[i].size()) {
                auto b = i2bvf(i);
                ch[i] = new Quadtree{topleft + halfsize() * b, size * 0.5f};
                ch[i]->build(npss[i], depth + 1);
            }
        }
    }

    void render(int depth = 0)
    {
        glLineWidth(par != -1 ? 2 : 1);
        glColor4f(par != -1 ? 0.9 : 0.3, 0.0, 0.0, 0.1);
        glBegin(GL_LINE_LOOP);
        glVertex2f(topleft.x, topleft.y);
        glVertex2f(topleft.x + size, topleft.y);
        glVertex2f(topleft.x + size, topleft.y + size);
        glVertex2f(topleft.x, topleft.y + size);
        glEnd();
        for (auto &&c: ch) {
            if (c) c->render(depth + 1);
        }
    }
};

Quadtree *qt;

void build_tree(void)
{
    // Note: a new tree is allocated every frame and the previous one is never
    // freed, so this demo leaks memory; fine for a toy, not for real code.
    qt = new Quadtree{glm::vec2(0.0), SIZ};
    std::vector<int> ps;
    for (int i = 0; i < P.size(); i++) {
        ps.push_back(i);
    }
    qt->build(ps);
}

glm::vec2 field(glm::vec2 const &pos)
{
    return qt->visit(pos);  /* comment out this line to fall back to the O(N*N) brute force below */
    glm::vec2 acc(0.0);
    for (auto &&p: P) {
        acc += gravity(p.pos - pos, p.mass);
    }
    return acc;
}

void advance(void)
{
    build_tree();
    float dt = (float(rand()) / RAND_MAX) * 0.7f + 0.4f;  // cast to float: integer division would almost always yield 0
    for (auto &&p: P) {
        if (p.background)
            continue;
        auto acc = field(p.pos);
        p.vel += acc * dt;
    }
    for (auto &&p: P) {
        p.pos += p.vel * dt;
    }
}

void init(void)
{
#if 1
    P.push_back(Particle(
            glm::vec2(0.47, 0.50),
            glm::vec2(0.00, 0.40),
            0.050));
    P.push_back(Particle(
            glm::vec2(0.53, 0.50),
            glm::vec2(0.00, -0.40),
            0.050));
    P.push_back(Particle(
            glm::vec2(1.10, 0.71),
            glm::vec2(0.00, 0.27),
            0.002));
#else
    P.push_back(Particle(
            glm::vec2(0.50, 0.44),
            glm::vec2(0.00, 0.00),
            0.050));
    P.push_back(Particle(
            glm::vec2(0.50, 0.56),
            glm::vec2(0.10, 0.00),
            0.050));
#endif

    float mass = 0.0;
    glm::vec2 adjust(0.0);
    for (auto &&p: P) {
        mass += p.mass;
        adjust += p.vel * p.mass;
    }
    adjust /= mass;
    for (auto &&p: P) {
        if (p.mass != 0.0)
            p.vel -= adjust;
    }
}

void display(void)
{
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glColor4f(0.0, 0.0, 0.0, 0.05);
    glRectf(0.0, 0.0, SIZ, SIZ);
    glColor4f(1.0, 1.0, 1.0, 1.0);
    glPointSize(3);
    glBegin(GL_POINTS);
    for (auto &&p: P) {
        p.render();
    }
    glEnd();
    if (qt) qt->render();
    glFlush();
}

void timer(int id)
{
    advance();
    advance();
    advance();
    advance();
    advance();
    glutPostRedisplay();
    glutTimerFunc(IVAL, timer, 1);
}

void keyboard(unsigned char key, int x, int y)
{
    if (key == 27)
        exit(0);
}

int main(int argc, char **argv)
{
    glutInit(&argc, argv);

    glutInitDisplayMode(GLUT_SINGLE | GLUT_RGB);
    glutInitWindowSize(RES, RES);
    glutCreateWindow("N-body");

    glutKeyboardFunc(keyboard);
    glutDisplayFunc(display);
    glutTimerFunc(IVAL, timer, 1);

    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    gluOrtho2D(0.0, SIZ, 0.0, SIZ);
    glShadeModel(GL_FLAT);

    init();

    glutMainLoop();
}

@archibate
Collaborator Author

@yuanming-hu could you write a doc on sparse data structures, so that I know how to use them to build octrees?

@yuanming-hu
Member

Right, I should do that once I get a chance...

@archibate
Collaborator Author

archibate commented Apr 18, 2020

The C++ version is done now, have fun! You may notice that the precession is gone once we take the return qt->visit(pos); path; this means we are now ignoring the binary star's finite size, so we really are at O(n log n)!
To bring the precession back and make the result as precise as the O(n*n) computation, we could extend Quadtree::mass to a Quadtree::quadrupole. When we eventually move this to Taichi, we should end up with a Python version of https://github.com/mockingbirdnest/Principia, and hopefully with even better performance :)
Anyway, no rush at all, this is for after v0.6.0 anyway... I hope to write an article on this n-body example to advertise Taichi; it should definitely attract a lot of physics people!
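
For reference, the textbook multipole expansion that such a Quadtree::quadrupole member would implement (standard physics, not something already in this code): take a node with total mass M, put the origin at its center of mass, and let its particles have masses m_k at positions x_k. The potential the node produces at a distant point r is approximately

\Phi(\mathbf{r}) \approx -\frac{G M}{|\mathbf{r}|} - \frac{G}{2\,|\mathbf{r}|^{5}} \sum_{i,j} Q_{ij}\, r_i\, r_j,
\qquad
Q_{ij} = \sum_k m_k \left( 3\, x_{k,i}\, x_{k,j} - |\mathbf{x}_k|^{2}\, \delta_{ij} \right),

and the acceleration contribution is -\nabla\Phi. Each node would store Q_{ij} alongside its mass and mass center, and visit() would add the extra term whenever it stops recursing.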

@archibate
Collaborator Author

archibate commented Apr 18, 2020

I found a bug when writing the Python n-body on ti.x64:

@ti.kernel
def adjust_momentum():
    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in pos:
        mac += mass[i]
        adj += vel[i] * mac
    adj = adj / mac
    for i in pos:
        vel[i] += adj

obtains:

[I 04/19/20 00:42:15.293] [compile_to_offloads.cpp:operator()@17] Constant folded:
kernel {
  <f32 x1> $0 = alloca
  <f32 x1> $1 = const [0.0]
  <f32 x1> $2 : local store [$0 <- $1]
  <f32 x1> $3 = alloca
  <f32 x1> $4 : local store [$3 <- $1]
  <f32 x1> $5 = alloca
  <f32 x1> $6 : local store [$5 <- $1]
  <i32 x1> $7 = alloca
  <i32 x1> $8 = const [0]
  <i32 x1> $9 = const [4]
  $10 : for $7 in range($8, $9, step 1) {
    <i32 x1> $11 = local load [ [$7[0]]]
    <i32 x1> $12 = const [-1]
    <i32 x1> $13 = bit_extract($11 + 0, 0~2)
    <i32 x1> $14 = const [3]
    <i32 x1> $15 = cmp_lt $13 $14
    <i32 x1> $16 = bit_and $12 $15
    $17 : if $16 {
      <f32*x1> $18 = global ptr [S8place_f32], index [$13] activate=false
      <gen*x1> $19 = get root
      <i32 x1> $20 = const [0]
      <gen*x1> $21 = [S0root][root]::lookup($19, $20) activate = false
      <gen*x1> $22 = get child [S0root->S7dense] $21
      <i32 x1> $23 = bit_extract($13 + 0, 0~2)
      <i32 x1> $24 = const [1]
      <i32 x1> $25 = mul $23 $24
      <i32 x1> $26 = add $20 $25
      <gen*x1> $27 = [S7dense][dense]::lookup($22, $26) activate = false
      <f32*x1> $28 = get child [S7dense->S8place_f32] $27
      <f32 x1> $29 = global load $28
      <f32 x1> $30 = atomic add($5, $29)
      <f32*x1> $31 = global ptr [S5place_f32], index [$13] activate=false
      <gen*x1> $32 = get child [S0root->S4dense] $21
      <gen*x1> $33 = [S4dense][dense]::lookup($32, $26) activate = false
      <f32*x1> $34 = get child [S4dense->S5place_f32] $33
      <f32 x1> $35 = global load $34
      <f32 x1> $36 = local load [ [$5[0]]]
      <f32 x1> $37 = mul $35 $36
      <f32*x1> $38 = global ptr [S6place_f32], index [$13] activate=false
      <f32*x1> $39 = get child [S4dense->S6place_f32] $33
      <f32 x1> $40 = global load $39
      <f32 x1> $41 = mul $40 $36
      <f32 x1> $42 = atomic add($0, $37)
      <f32 x1> $43 = atomic add($3, $41)
    }
  }
  <f32 x1> $44 = local load [ [$0[0]]]
  <f32 x1> $45 = local load [ [$5[0]]]
  <f32 x1> $46 = div $44 $45
  <f32 x1> $47 = local load [ [$3[0]]]
  <f32 x1> $48 = div $47 $45
  <i32 x1> $49 = alloca
  $50 : for $49 in range($8, $9, step 1) {
    <i32 x1> $51 = local load [ [$49[0]]]
    <i32 x1> $52 = const [-1]
    <i32 x1> $53 = bit_extract($51 + 0, 0~2)
    <i32 x1> $54 = const [3]
    <i32 x1> $55 = cmp_lt $53 $54
    <i32 x1> $56 = bit_and $52 $55
    $57 : if $56 {
      <f32*x1> $58 = global ptr [S5place_f32], index [$53] activate=false
      <gen*x1> $59 = get root
      <i32 x1> $60 = const [0]
      <gen*x1> $61 = [S0root][root]::lookup($59, $60) activate = false
      <gen*x1> $62 = get child [S0root->S4dense] $61
      <i32 x1> $63 = bit_extract($53 + 0, 0~2)
      <i32 x1> $64 = const [1]
      <i32 x1> $65 = mul $63 $64
      <i32 x1> $66 = add $60 $65
      <gen*x1> $67 = [S4dense][dense]::lookup($62, $66) activate = false
      <f32*x1> $68 = get child [S4dense->S5place_f32] $67
      <f32 x1> $69 = atomic add($68, $46)
      <f32*x1> $70 = global ptr [S6place_f32], index [$53] activate=false
      <f32*x1> $71 = get child [S4dense->S6place_f32] $67
      <f32 x1> $72 = atomic add($71, $48)
    }
  }
}
[I 04/19/20 00:42:15.294] [compile_to_offloads.cpp:operator()@17] Offloaded:
kernel {
  $0 = offloaded  {
    <f32*x1> $1 = global tmp var (offset = 4 B)
    <f32 x1> $2 = const [0.0]
    <f32*x1> $3 : global store [$1 <- $2]
    <f32 x1> $4 = const [0.0]
    <f32*x1> $5 = global tmp var (offset = 4 B)
    <f32 x1> $6 = const [0.0]
    <f32*x1> $7 : global store [$5 <- $6]
    <f32*x1> $8 = global tmp var (offset = 8 B)
    <f32 x1> $9 = const [0.0]
    <f32*x1> $10 : global store [$8 <- $9]
    <f32*x1> $11 = global tmp var (offset = 8 B)
    <f32 x1> $12 = const [0.0]
    <f32*x1> $13 : global store [$11 <- $12]
    <f32*x1> $14 = global tmp var (offset = 0 B)
    <f32 x1> $15 = const [0.0]
    <f32*x1> $16 : global store [$14 <- $15]
    <f32*x1> $17 = global tmp var (offset = 0 B)
    <f32 x1> $18 = const [0.0]
    <f32*x1> $19 : global store [$17 <- $18]
    <i32 x1> $20 = alloca
    <i32 x1> $21 = const [0]
    <i32 x1> $22 = const [4]
  }
  $23 = offloaded range_for(0, 4) block_dim=adaptive {
    <i32 x1> $24 = loop index 0
    <i32 x1> $25 = const [-1]
    <i32 x1> $26 = bit_extract($24 + 0, 0~2)
    <i32 x1> $27 = const [3]
    <i32 x1> $28 = cmp_lt $26 $27
    <i32 x1> $29 = bit_and $25 $28
    $30 : if $29 {
      <f32*x1> $31 = global ptr [S8place_f32], index [$26] activate=false
      <gen*x1> $32 = get root
      <i32 x1> $33 = const [0]
      <gen*x1> $34 = [S0root][root]::lookup($32, $33) activate = false
      <gen*x1> $35 = get child [S0root->S7dense] $34
      <i32 x1> $36 = bit_extract($26 + 0, 0~2)
      <i32 x1> $37 = const [1]
      <i32 x1> $38 = mul $36 $37
      <i32 x1> $39 = add $33 $38
      <gen*x1> $40 = [S7dense][dense]::lookup($35, $39) activate = false
      <f32*x1> $41 = get child [S7dense->S8place_f32] $40
      <f32 x1> $42 = global load $41
      <f32*x1> $43 = global tmp var (offset = 0 B)
      <f32 x1> $44 = atomic add($43, $42)
      <f32*x1> $45 = global ptr [S5place_f32], index [$26] activate=false
      <gen*x1> $46 = get child [S0root->S4dense] $34
      <gen*x1> $47 = [S4dense][dense]::lookup($46, $39) activate = false
      <f32*x1> $48 = get child [S4dense->S5place_f32] $47
      <f32 x1> $49 = global load $48
      <f32*x1> $50 = global tmp var (offset = 0 B)
      <f32 x1> $51 = global load $50
      <f32 x1> $52 = mul $49 $51
      <f32*x1> $53 = global ptr [S6place_f32], index [$26] activate=false
      <f32*x1> $54 = get child [S4dense->S6place_f32] $47
      <f32 x1> $55 = global load $54
      <f32 x1> $56 = mul $55 $51
      <f32*x1> $57 = global tmp var (offset = 4 B)
      <f32 x1> $58 = atomic add($57, $52)
      <f32*x1> $59 = global tmp var (offset = 8 B)
      <f32 x1> $60 = atomic add($59, $56)
    }
  }
  $61 = offloaded  {
    <f32*x1> $62 = global tmp var (offset = 4 B)
    <f32 x1> $63 = global load $62
    <f32*x1> $64 = global tmp var (offset = 0 B)
    <f32 x1> $65 = global load $64
    <f32 x1> $66 = div $63 $65
    <f32*x1> $67 = global tmp var (offset = 8 B)
    <f32 x1> $68 = global load $67
    <f32 x1> $69 = div $68 $65
    <i32 x1> $70 = alloca
  }
  $71 = offloaded range_for(0, 4) block_dim=adaptive {
    <i32 x1> $72 = loop index 0
    <i32 x1> $73 = const [-1]
    <i32 x1> $74 = bit_extract($72 + 0, 0~2)
    <i32 x1> $75 = const [3]
    <i32 x1> $76 = cmp_lt $74 $75
    <i32 x1> $77 = bit_and $73 $76
    $78 : if $77 {
      <f32*x1> $79 = global ptr [S5place_f32], index [$74] activate=false
      <gen*x1> $80 = get root
      <i32 x1> $81 = const [0]
      <gen*x1> $82 = [S0root][root]::lookup($80, $81) activate = false
      <gen*x1> $83 = get child [S0root->S4dense] $82
      <i32 x1> $84 = bit_extract($74 + 0, 0~2)
      <i32 x1> $85 = const [1]
      <i32 x1> $86 = mul $84 $85
      <i32 x1> $87 = add $81 $86
      <gen*x1> $88 = [S4dense][dense]::lookup($83, $87) activate = false
      <f32*x1> $89 = get child [S4dense->S5place_f32] $88
      <f32 x1> $90 = atomic add($89, $66)
      <f32*x1> $91 = global ptr [S6place_f32], index [$74] activate=false
      <f32*x1> $92 = get child [S4dense->S6place_f32] $88
      <f32 x1> $93 = atomic add($92, $69)
    }
  }
}
[E 04/19/20 00:38:07.480] [verify.cpp:basic_verify@37] stmt 90 cannot have operand 66.

It cannot, because $66 exists in the previous offloaded task, and constant_fold optimized it even though that crosses offloaded tasks.
Maybe we can move the Constant Fold pass to after Offloaded? @xumingkuan @yuanming-hu

In addition, if I change it to vel[i] -= adj, there is no error, but the adjustment result is wrong (the momentum is not cleared, and becomes larger than before).
I'll try to write a minimal example to reproduce this bug.

@yuanming-hu
Member

Right, this is indeed a bug, being fixed in #813...

@yuanming-hu
Member

I merged #813 in. Could you retry with the latest commit?

@archibate
Collaborator Author

archibate commented Apr 18, 2020

Great! Building now. But can you tell me why -= doesn't produce that error while += does? Do we treat them differently somewhere?

@archibate
Collaborator Author

After merging with the latest commit, the issue still exists...

@archibate
Collaborator Author

archibate commented Apr 18, 2020

I realized this is not about constant_fold.

<f32 x1> $67 = local load [ [$0[0]]]
  <f32 x1> $68 = local load [ [$5[0]]]
  <f32 x1> $69 = div $67 $68
  <f32 x1> $70 : local store [$0 <- $69]
  <f32 x1> $71 = local load [ [$3[0]]]
  <f32 x1> $72 = div $71 $68
  <f32 x1> $73 : local store [$3 <- $72]
  <i32 x1> $74 = alloca

after Simplified II:

<f32 x1> $44 = local load [ [$0[0]]]
  <f32 x1> $45 = local load [ [$5[0]]]
  <f32 x1> $46 = div $44 $45
  <f32 x1> $47 = local load [ [$3[0]]]
  <f32 x1> $48 = div $47 $45
  <i32 x1> $49 = alloca

The local store was eliminated! Why? Can we run full_simplify after the Offloaded pass? That way it wouldn't have to worry about offload boundaries.

@archibate
Collaborator Author

archibate commented Apr 18, 2020

I removed full_simplify and compilation passed.
Now things get more interesting:

@ti.kernel
def adjust_momentum():
    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in vel:
        mac += mass[i]
        adj += vel[i] * mac
    adj /= mac
    print(adj[0])
    print(adj[1])
    for i in pos:
        print(adj[0])
        print(adj[1])
        vel[i] += adj

obtains:

[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000466
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000466
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000466
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000466

If I change vel[i] += adj to vel[i] -= adj:

[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000074
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000074
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000074
[debug] adj[0] = 0.000000
[debug] adj[1] = 0.000074

Schrödinger's cat?? Why can changing the code after a line affect that line???

@archibate
Collaborator Author

import taichi as ti
import numpy as np

SIZ = 1.2
DT = 0.001

ti.init(arch=ti.x64)

N = 3
pos = ti.Vector(2, dt=ti.f32, shape=N)
vel = ti.Vector(2, dt=ti.f32, shape=N)
mass = ti.var(dt=ti.f32, shape=N)

@ti.func
def gravity(dis, m):
    d = dis.norm()
    ret = ti.Vector([0.0, 0.0])
    if d >= 1e-5:
        sc = m / d ** 3
        ret = dis * sc
    return ret

@ti.func
def field(p):
    acc = ti.Vector([0.0, 0.0])
    for i in range(N):
        acc += gravity(pos[i] - p, mass[i])
    return acc

@ti.kernel
def advance():
    dt = ti.random() * 0.7 + 0.4
    for i in pos:
        acc = field(pos[i])
        vel[i] += acc * dt
    for i in pos:
        pos[i] += vel[i] * dt


@ti.kernel
def adjust_momentum():
    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in vel:
        mac += mass[i]
        adj += vel[i] * mac
    adj /= mac
    print(adj[0])
    print(adj[1])
    for i in pos:
        print(adj[0])
        print(adj[1])
        vel[i] += adj

def initialize():
    pos.from_numpy(np.array([0.47, 0.50, 0.53, 0.50, 1.10, 0.71]).reshape(N, 2))
    vel.from_numpy(np.array([0.00, 0.40, 0.00, -0.4, 0.00, 0.27]).reshape(N, 2) * DT)
    mass.from_numpy(np.array([0.050, 0.050, 0.000]) * DT ** 2)


initialize()
adjust_momentum()

gui = ti.GUI('N-body')
while not gui.get_event(ti.GUI.PRESS):
    for i in range(5):
        _pos = pos.to_numpy()
        gui.circles(_pos / SIZ, radius=5, color=0x333333 * i)
        for i in range(5):
            advance()
    gui.show()

Have bug fun!

@yuanming-hu
Member

Thanks for sharing this. There's actually a race condition (which is undefined behavior in Taichi) in this piece of code, so I'm not surprised that you are getting weird behaviors.

    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in vel:
        mac += mass[i]
        adj += vel[i] * mac

Note that mac is read and written in parallel.

I believe you meant

    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in vel: # Note this loop is parallel
        mac += mass[i]
        adj += vel[i] * mass[i] # instead of mac
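
Putting the two together, a minimal sketch of the corrected kernel (my own combination of the kernel above with this fix, meant to drop into the script earlier in the thread, and assuming the offload issue discussed above is fixed):

@ti.kernel
def adjust_momentum():
    adj = ti.Vector([0.0, 0.0])
    mac = 0.0
    for i in vel:                # parallel loop; += on mac/adj lowers to atomic adds
        mac += mass[i]
        adj += vel[i] * mass[i]  # weight by mass[i], not by the running sum
    adj /= mac                   # serial code between the loops: center-of-mass velocity
    for i in vel:
        vel[i] -= adj            # subtract it so the total momentum becomes zero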

@archibate
Collaborator Author

Another bug:

@ti.kernel
def advance():
    for i in range(N):
        n = ti.cast(ti.max(vel[i].norm() / VEL0 + 0.5, 1), ti.i32)
        cnt[i] = n
    for i in range(N):
        dt = 1 / cnt[i]
        for t in range(cnt[i]):
            acc[i] = field(pos[i])
            print(acc[i][0] * INV_DT ** 2)
            for s in range(1):
                vel[i] += acc[i] * dt
                pos[i] += vel[i] * dt

obtains:

[debug] acc[i][0] * INV_DT ** 2 = 0.000000
[debug] acc[i][0] * INV_DT ** 2 = 0.000000
[debug] acc[i][0] * INV_DT ** 2 = 0.000000

If I replace for s in range(1): with if 1:, the output becomes normal:

[debug] acc[i][0] * INV_DT ** 2 = 0.000000
[debug] acc[i][0] * INV_DT ** 2 = -0.211246
[debug] acc[i][0] * INV_DT ** 2 = -0.211451
[debug] acc[i][0] * INV_DT ** 2 = 0.000000
[debug] acc[i][0] * INV_DT ** 2 = -0.211657
[debug] acc[i][0] * INV_DT ** 2 = -0.211862

@archibate
Collaborator Author

@yuanming-hu I suddenly realized that differentiable programming may help us calculate the gravity force vector from the potential energy!!!
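
A rough sketch of that idea (my own; the field names are hypothetical and it assumes Taichi's needs_grad / ti.Tape autodiff machinery): accumulate the total potential energy into a 0-D field and read the forces off the position gradients.

import taichi as ti

ti.init()

N = 8
pos = ti.Vector(2, dt=ti.f32, shape=N, needs_grad=True)
mass = ti.var(dt=ti.f32, shape=N)
U = ti.var(dt=ti.f32, shape=(), needs_grad=True)

@ti.kernel
def potential():
    for i in range(N):
        for j in range(N):
            if i < j:
                r = (pos[i] - pos[j]).norm()
                U[None] += -mass[i] * mass[j] / r  # pairwise gravitational potential energy

# Fill pos and mass first, then:
U[None] = 0
with ti.Tape(loss=U):
    potential()
# The gravitational force on particle i is now -pos.grad[i].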

@archibate
Collaborator Author

archibate commented Apr 19, 2020

import taichi as ti
import numpy as np
from random import Random

rand_gen = Random()

SIZ = 1.2
DT = 0.0005
INV_DT = 1 / DT
INV_VEL0 = INV_DT / 0.2
INV_ACC0 = INV_DT ** 2 / 0.2
MAX_LOSS = 2 ** 14
MIN_DIS3 = 4e-7

ti.init(arch=ti.cuda)#, print_ir=True)

N = 32
T = 9
pos = ti.Vector(2, dt=ti.f32)
acc = ti.Vector(2, dt=ti.f32)
vel = ti.Vector(2, dt=ti.f32)
mass = ti.var(dt=ti.f32)
pot = ti.var(dt=ti.i32)
max_pot = ti.var(dt=ti.i32)

d_x = ti.root.dense(ti.i, N)
b_x = ti.root.bitmasked(ti.i, N)
d_x.place(pos, vel, pot, mass)
b_x.place(acc)
ti.root.place(max_pot)

@ti.func
def gravity(dis, m):
    d = dis.norm()
    ret = ti.Vector([0.0, 0.0])
    ret = dis * (m / (MIN_DIS3 + d ** 3))
    return ret

@ti.func
def field(p):
    a = ti.Vector([0.0, 0.0])
    for i in range(N):
        a += gravity(pos[i] - p, mass[i])
    return a

@ti.kernel
def calc_pot():
    max_pot[None] = 0
    for i in range(N):
        loss = acc[i].norm() * INV_ACC0
        loss += vel[i].norm() * INV_VEL0
        n = ti.min(ti.max(loss, 1), MAX_LOSS)
        pot[i] = ti.cast(ti.log(n) / ti.log(2.0) + 0.5, ti.i32)
        ti.atomic_max(max_pot[None], pot[i])

@ti.kernel
def advance_sub(seq: ti.i32):#, dt0: ti.f32):
    for i in range(N):
        p = max_pot[None] - pot[i]
        tp = 2 ** p
        if seq % tp:
            ti.deactivate(b_x, i)
            continue
        acc[i] = field(pos[i])

    for i in acc:
        steps = 2 ** pot[i]
        dt = 1 / steps
        vel[i] += acc[i] * dt
        pos[i] += vel[i] * dt

@ti.kernel
def bouncing_wall():
    for i in range(N):
        for s in ti.static(range(2)):
            if pos[i][s] >= SIZ and vel[i][s] > 0:
                vel[i][s] = -vel[i][s]
            elif pos[i][s] <= 0 and vel[i][s] < 0:
                vel[i][s] = -vel[i][s]

def advance():
    calc_pot()
    max_seq = 2 ** max_pot[None]
    for seq in range(max_seq):
        advance_sub(seq)#, rand_gen.gauss(1, 0.1))
    bouncing_wall()

@ti.kernel
def adjust_momentum():
    mac = 0.0
    adj = ti.Vector([0.0, 0.0])
    for i in range(N):
        mac += mass[i]
        adj += vel[i] * mass[i]
    adj /= mac
    for i in range(N):
        vel[i] -= adj

@ti.kernel
def calc_energy():
    e = 0.0
    for i in range(N):
        e += mass[i] * vel[i].norm_sqr()
    for i in range(N):
        phi = 0.0
        for j in range(N):
            if j == i: continue
            d = pos[i] - pos[j]
            phi += mass[j] / d.norm()
        e -= phi * mass[i]
    e *= INV_DT ** 4
    print(e)

@ti.kernel
def calc_momentum():
    p = ti.Vector([0.0, 0.0])
    for i in range(N):
        p += vel[i] * mass[i]
    p *= INV_DT ** 3
    print(p[0])
    print(p[1])

@ti.kernel
def initialize():
    for i in range(N):
        pos[i] = ti.Vector([ti.random(), ti.random()]) * SIZ
        vel[i] = (ti.Vector([ti.random(), ti.random()]) - 0.5) * 6.0 * DT
        mass[i] = 0.1 * ti.random() * DT ** 2


initialize()
#adjust_momentum()

gui = ti.GUI('N-body')
for frame in range(100000):
    if gui.get_event(ti.GUI.PRESS):
        break
    for i in range(4):
        advance()
    _pos = pos.to_numpy()
    gui.circles(_pos / SIZ, radius=2)
    gui.show()
    if frame % 32 == 0:
        calc_energy()

Update: dispatch more computational resources to the bodies deeper in the gravity well, so that energy is conserved (i.e. dt is smaller for particles whose acceleration is large).
Not sure if I'm using bitmasked correctly...

@archibate
Collaborator Author

archibate commented Apr 19, 2020

How do I use pointer? Could you write a simple sparse quadtree example for me? It would be very helpful, thank you!
BUG again: a triply nested for-loop is not working.
Going to sleep now, and wondering whether we want a separate issue for this BUG.

@KLozes
Collaborator

KLozes commented Apr 19, 2020

@archibate I think @yuanming-hu's Taichi paper would be a good place to start reading about storing particles in sparse data structures: http://taichi.graphics/wp-content/uploads/2019/09/taichi_lang.pdf . You can look at the taichi_sparse.py example for the new syntax.

Also, you probably would not want to build a quadtree directly from Taichi pointer data structures. Instead, you would create a pyramid of sparse grids with increasing resolution to emulate a quadtree. An example of this is mgpcg.py.
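
A rough sketch of that grid-pyramid idea (this is my own illustration, not taken from mgpcg.py; the field names are made up, the SNode calls reuse the bitmasked syntax already used earlier in this thread, and positions are assumed to be normalized to [0, 1)):

import taichi as ti

ti.init()

L = 5        # number of pyramid levels; level l has resolution 2 ** (l + 1)
N = 128      # number of particles

pos = ti.Vector(2, dt=ti.f32, shape=N)
mass = ti.var(dt=ti.f32, shape=N)

# One sparse mass grid per level; only the cells that receive mass get activated.
grid_m = []
for l in range(L):
    res = 2 ** (l + 1)
    m = ti.var(dt=ti.f32)
    ti.root.bitmasked(ti.ij, res).place(m)
    grid_m.append(m)

@ti.kernel
def scatter(gm: ti.template(), res: ti.i32):
    for i in range(N):
        ix = ti.cast(pos[i][0] * res, ti.i32)  # cell containing the particle at this level
        iy = ti.cast(pos[i][1] * res, ti.i32)
        gm[ix, iy] += mass[i]

# Build the pyramid: the coarse levels play the role of a quadtree's internal
# nodes (per-cell total mass; a mass-center field would be added the same way).
for l in range(L):
    scatter(grid_m[l], 2 ** (l + 1))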
