RNN classification example #19

Closed
mazko opened this issue Mar 5, 2019 · 6 comments

mazko commented Mar 5, 2019

When classifying a sequence, we would like the network to have a single output instead of a sequence of outputs. According to 01user.md, kad_avg was mentioned as the way to classify a sequence. I tried this on MNIST. It works, but I am not sure how to train such a network. During training we don't even know the output values other than the last one. In the line memcpy(&y[k][b * d->n_out], d->y[s], d->n_out * sizeof(float)); every y in the sequence of outputs gets the same value d->y[s], which looks strange.

#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "kann_extra/kann_data.h"
#include "kann.h"

typedef struct {
  int n_in, n_out, ulen, n;
  float **x, **y;
} train_data;

static void train(kann_t *ann, train_data *d, float lr, int mini_size, int max_epoch, const char *fn, int n_threads)
{
  float **x, **y, *r, best_cost = 1e30f;
  int epoch, j, n_var, *shuf;
  kann_t *ua;

  n_var = kann_size_var(ann);
  r = (float*)calloc(n_var, sizeof(float));
  x = (float**)malloc(d->ulen * sizeof(float*));
  y = (float**)malloc(d->ulen * sizeof(float*));
  for (j = 0; j < d->ulen; ++j) {
    x[j] = (float*)calloc(mini_size * d->n_in, sizeof(float));
    y[j] = (float*)calloc(mini_size * d->n_out, sizeof(float));
  }
  shuf = (int*)calloc(d->n, sizeof(int));

  ua = kann_unroll(ann, d->ulen);
  kann_set_batch_size(ua, mini_size);
  kann_mt(ua, n_threads, mini_size);
  kann_feed_bind(ua, KANN_F_IN,    0, x);
  kann_feed_bind(ua, KANN_F_TRUTH, 0, y);
  kann_switch(ua, 1);
  for (epoch = 0; epoch < max_epoch; ++epoch) {
    kann_shuffle(d->n, shuf);
    double cost = 0.0;
    int tot = 0, tot_base = 0, n_cerr = 0;
    for (j = 0; j < d->n - mini_size; j += mini_size) {
      int b, k;
      for (k = 0; k < d->ulen; ++k) {
        for (b = 0; b < mini_size; ++b) {
          int s = shuf[j + b];
          memcpy(&x[k][b * d->n_in], &d->x[s][k * d->n_in], d->n_in * sizeof(float));
          memcpy(&y[k][b * d->n_out], d->y[s], d->n_out * sizeof(float));
        }
      }
      cost += kann_cost(ua, 0, 1) * d->ulen * mini_size;
      n_cerr += kann_class_error(ua, &k);
      tot_base += k;
      //kad_check_grad(ua->n, ua->v, ua->n-1);
      kann_RMSprop(n_var, lr, 0, 0.9f, ua->g, ua->x, r);
      tot += d->ulen * mini_size;
    }
    if (cost < best_cost) {
      best_cost = cost;
      if (fn) kann_save(fn, ann);
    }
    fprintf(stderr, "epoch: %d; cost: %g (class error: %.2f%%)\n", epoch+1, cost / tot, 100.0f * n_cerr / tot_base);
  }

  kann_delete_unrolled(ua);

  for (j = 0; j < d->ulen; ++j) {
    free(y[j]); free(x[j]);
  }
  free(y); free(x); free(r); free(shuf);
}

static train_data* create_train_data(kann_t *ann, kann_data_t *x, kann_data_t *y)
{
  train_data *d;
  d = (train_data*)malloc(sizeof(*d));
  assert(d);
  assert(x->n_row == y->n_row);
  d->x = x->x;
  d->y = y->x;
  d->ulen = 28; // 28x28
  d->n = x->n_row;
  d->n_in = kann_dim_in(ann);
  d->n_out = kann_dim_out(ann);
  return d;
}

int main(int argc, char *argv[])
{
  kann_t *ann;
  kann_data_t *x, *y;
  char *fn_in = 0, *fn_out = 0;
  int c, i, mini_size = 64, max_epoch = 50, seed = 84, n_h_layers = 1, n_h_neurons = 64, norm = 1, n_threads = 1;
  float lr = 0.001f, dropout = 0.2f;

  while ((c = getopt(argc, argv, "i:o:m:l:n:d:s:t:N")) >= 0) {
    if (c == 'i') fn_in = optarg;
    else if (c == 'o') fn_out = optarg;
    else if (c == 'm') max_epoch = atoi(optarg);
    else if (c == 'l') n_h_layers = atoi(optarg);
    else if (c == 'n') n_h_neurons = atoi(optarg);
    else if (c == 'd') dropout = atof(optarg);
    else if (c == 's') seed = atoi(optarg);
    else if (c == 't') n_threads = atoi(optarg);
    else if (c == 'N') norm = 0;
  }

  if (argc - optind == 0 || (argc - optind == 1 && fn_in == 0)) {
    FILE *fp = stdout;
    fprintf(fp, "Usage: mnist-cnn [-i model] [-o model] [-t nThreads] <x.knd> [y.knd]\n");
    return 1;
  }

  kad_trap_fe();
  kann_srand(seed);
  if (fn_in) {
    ann = kann_load(fn_in);
  } else {
    kad_node_t *t;
    int rnn_flag = KANN_RNN_VAR_H0;
    if (norm) rnn_flag |= KANN_RNN_NORM;
    t = kann_layer_input(28); // 28x28
    for (i = 0; i < n_h_layers; ++i) {
      t = kann_layer_gru(t, n_h_neurons, rnn_flag);
      t = kann_layer_dropout(t, dropout);
    }
    t = kad_avg(1, &t);
    ann = kann_new(kann_layer_cost(t, 10, KANN_C_CEB), 0);
  }

  x = kann_data_read(argv[optind]);
  assert(x->n_col == 28 * 28);
  y = argc - optind >= 2? kann_data_read(argv[optind+1]) : 0;

  if (y) { // training
    assert(y->n_col == 10);
    if (n_threads > 1) kann_mt(ann, n_threads, mini_size);
    train_data *d;
    d = create_train_data(ann, x, y);
    train(ann, d, lr, mini_size, max_epoch, fn_out, n_threads);
    free(d);
    kann_data_free(y);
  } else { // applying
    int i, j, k, n_out;
    kann_switch(ann, 0);
    n_out = kann_dim_out(ann);
    assert(n_out == 10);
    for (i = 0; i < x->n_row; ++i) {
      const float *y;
      kann_rnn_start(ann);
      for(k = 0; k < 28; ++k) {
        float x1[28];
        memcpy(x1, &x->x[i][k * 28], sizeof(x1));
        y = kann_apply1(ann, x1);
      }
      if (x->rname) printf("%s\t", x->rname[i]);
      for (j = 0; j < n_out; ++j) {
        if (j) putchar('\t');
        printf("%.3g", y[j] + 1.0f - 1.0f);
      }
      putchar('\n');
      kann_rnn_end(ann);
    }
  }

  kann_data_free(x);
  kann_delete(ann);
  return 0;
}

It would be great to see a simple RNN classification example.

attractivechaos (Owner) commented:

If I am right, you can replace kad_avg(1, &t) with kad_select(1, &t, -1), which selects the last hidden vector. I haven't touched kann for a while, so I could be wrong...

    kad_node_t *t;
    int rnn_flag = KANN_RNN_VAR_H0;
    if (norm) rnn_flag |= KANN_RNN_NORM;
    t = kann_layer_input(28); // 28x28
    for (i = 0; i < n_h_layers; ++i) {
      t = kann_layer_gru(t, n_h_neurons, rnn_flag);
      t = kann_layer_dropout(t, dropout);
    }
    // replace "t = kad_avg(1, &t);" with:
    t = kad_select(1, &t, -1);
    ann = kann_new(kann_layer_cost(t, 10, KANN_C_CEB), 0);

attractivechaos (Owner) commented Mar 5, 2019

I am not sure how to train such a network.

When you use kad_avg() or kad_select(), there will be a single output of shape (batch, 10) in the case of MNIST, independent of the unrolled length ulen. The following part needs to be changed to:

  n_var = kann_size_var(ann);
  r = (float*)calloc(n_var, sizeof(float));
  x = (float**)malloc(d->ulen * sizeof(float*));
  y = (float**)malloc(1 * sizeof(float*));
  for (j = 0; j < d->ulen; ++j)
    x[j] = (float*)calloc(mini_size * d->n_in, sizeof(float));
  y[0] = (float*)calloc(mini_size * d->n_out, sizeof(float));
  shuf = (int*)calloc(d->n, sizeof(int));
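
The mini-batch filling loop then copies the label only once per sample, into y[0]. A rough sketch, reusing the variable names and buffers from the code above (b, k, s, shuf and d are as in the original train function):

  for (b = 0; b < mini_size; ++b) {
    int s = shuf[j + b];
    for (k = 0; k < d->ulen; ++k) // one 28-float image row per time step
      memcpy(&x[k][b * d->n_in], &d->x[s][k * d->n_in], d->n_in * sizeof(float));
    memcpy(&y[0][b * d->n_out], d->y[s], d->n_out * sizeof(float)); // the label goes into y[0] only
  }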

every y in the sequence of outputs gets the same value d->y[s]

KANN only uses the first truth buffer, y[0], so your training is still successful.

It would be great to see a simple RNN classification example.

Yes, indeed. I should add such an example at some point...

mazko (Author) commented Mar 10, 2019

Thank you for clarifying. I changed the train function a little bit according to your suggestions.

static void train(kann_t *ann, train_data *d, float lr, int mini_size, int max_epoch, const char *fn, int n_threads)
{
  float **x, **y, *r, best_cost = 1e30f;
  int epoch, j, n_var, *shuf;
  kann_t *ua;

  n_var = kann_size_var(ann);
  r = (float*)calloc(n_var, sizeof(float));
  x = (float**)malloc(d->ulen * sizeof(float*));
  y = (float**)malloc(1 * sizeof(float*));
  for (j = 0; j < d->ulen; ++j) {
    x[j] = (float*)calloc(mini_size * d->n_in, sizeof(float));
  }
  y[0] = (float*)calloc(mini_size * d->n_out, sizeof(float));
  shuf = (int*)calloc(d->n, sizeof(int));

  ua = kann_unroll(ann, d->ulen);
  kann_set_batch_size(ua, mini_size);
  kann_mt(ua, n_threads, mini_size);
  kann_feed_bind(ua, KANN_F_IN,    0, x);
  kann_feed_bind(ua, KANN_F_TRUTH, 0, y);
  kann_switch(ua, 1);
  for (epoch = 0; epoch < max_epoch; ++epoch) {
    kann_shuffle(d->n, shuf);
    double cost = 0.0;
    int tot = 0, tot_base = 0, n_cerr = 0;
    for (j = 0; j < d->n - mini_size; j += mini_size) {
      int b, k;
      for (b = 0; b < mini_size; ++b) {
        int s = shuf[j + b];
        for (k = 0; k < d->ulen; ++k) {
          memcpy(&x[k][b * d->n_in], &d->x[s][k * d->n_in], d->n_in * sizeof(float));
        }
        memcpy(&y[0][b * d->n_out], d->y[s], d->n_out * sizeof(float));
      }
      cost += kann_cost(ua, 0, 1) * d->ulen * mini_size;
      n_cerr += kann_class_error(ua, &k);
      tot_base += k;
      //kad_check_grad(ua->n, ua->v, ua->n-1);
      kann_RMSprop(n_var, lr, 0, 0.9f, ua->g, ua->x, r);
      tot += d->ulen * mini_size;
    }
    if (cost < best_cost) {
      best_cost = cost;
      if (fn) kann_save(fn, ann);
    }
    fprintf(stderr, "epoch: %d; cost: %g (class error: %.2f%%)\n", epoch+1, cost / tot, 100.0f * n_cerr / tot_base);
  }

  kann_delete_unrolled(ua);

  for (j = 0; j < d->ulen; ++j) {
    free(x[j]);
  }
  free(y[0]); free(y); free(x); free(r); free(shuf);
}

Both previous and current implementations produce exactly the same mnist-rnn.kan file.

I also tried kad_select instead of kad_avg, and kad_select seems to learn faster on MNIST.

~$ ./mnist-rnn -o mnist-rnn.kan kann-data/mnist-train-?.knd
~$ ./mnist-rnn -i mnist-rnn.kan kann-data/mnist-test-x.knd | kann-data/mnist-eval.pl
Error rate: 2.45% # kad_avg(1, &t);
Error rate: 1.42% # kad_select(1, &t, -1);

mazko closed this as completed Mar 10, 2019

lh3 (Collaborator) commented Mar 10, 2019

Thanks a lot for the confirmation. Do you mind if I modify your code a little bit and add it to the "examples/" directory? I will say something like "Code was written by Oleg Mazko (@mazko) and modified by Attractive Chaos". Is that ok?

mazko (Author) commented Mar 10, 2019

Thanks a lot for the confirmation. Do you mind if I modify your code a little bit and add it to the "examples/" directory? I will say something like "Code was written by Oleg Mazko (@mazko) and modified by Attractive Chaos". Is that ok?

Of course it's ok.

There is also a project https://github.com/42io/c_keyword_spotting which is based on kann. I am planning to add an RNN there too in a couple of weeks or so.

shipleyxie commented:

Have you finished this goal of applying GRU in KANN? @mazko
