diff --git a/cgt/__init__.py b/cgt/__init__.py
index 57683ec..2031a5d 100644
--- a/cgt/__init__.py
+++ b/cgt/__init__.py
@@ -1,7 +1,7 @@
 from .api import *
 from .display import print_tree, print_expr, print_text, as_dot
 from .compilation import function, numeric_eval, profiler
-from .core import grad, get_config, update_config, simplify, reset_config, Device, scoped_update_config, infer_shape
+from .core import grad, get_config, update_config, simplify, reset_config, Device, scoped_update_config, infer_shape, count_nodes
 from .ez import EasyCustomOp
 try: 
     import cycgt
diff --git a/cgt/api.py b/cgt/api.py
index 1e26802..82f1216 100644
--- a/cgt/api.py
+++ b/cgt/api.py
@@ -324,6 +324,11 @@ def getitem_nonfancy(arr, slis):
             step = (1 if sli.step is None else sli.step)
             if (isinstance(stop, int) and (stop < 0)):
                 stop = size(arr, ax) - stop
+            if isinstance(step, int):
+                assert step != 0
+                if step < 0:
+                    raise NotImplementedError("negative `step parameter is not implemented. use flip(x,0) instead of x[::-1]")
+
             out = core.Result(core.GetSli(ax), [out, start, stop, step])
         ax += 1
     if all(((x == 'k') for x in shapedesc)):
@@ -441,7 +446,7 @@ def repeat(x, repeats, axis):
     """
     Like numpy.repeat
     """
-    return core.Result(core.Repeat([axis]), [x, constant(repeats)])
+    return core.Result(core.Repeat([axis]), [x, core.as_node(repeats)])
 
 def reshape(x, shp):
     """
diff --git a/cgt/compilation.py b/cgt/compilation.py
index ea59e0b..0e2f3eb 100644
--- a/cgt/compilation.py
+++ b/cgt/compilation.py
@@ -1,6 +1,6 @@
 from . import core, utils
 import cgt
-import ctypes, os.path as osp, hashlib, numpy as np, sys, subprocess, string, os, time, traceback
+import ctypes, os.path as osp, hashlib, numpy as np, sys, subprocess, string, os, time, traceback, cPickle
 from collections import defaultdict, namedtuple
 from StringIO import StringIO
 import logging
@@ -38,10 +38,16 @@ def _function_listout(inputs, outputs, dbg = None, updates=None, givens=None):
 # Execution
 # ================================================================
 
+def python_only():  # True when the `cgt` package exposes no `cycgt` attribute (presumably the compiled extension was not importable -- confirm against cgt/__init__.py)
+    return not hasattr(cgt,"cycgt")
+
 def determine_devices(nodes_sorted, updatetarg2src):
     # Op definitions (available impls, inplace-ness, etc) define constraints
     # on possible devices for a node
 
+    if python_only():
+        return {node:Device() for node in nodes_sorted}
+
     # (1) Get available devices for nodes, determined by which impls are available and node types
     compile_info = get_compile_info()
 
@@ -264,7 +270,6 @@ def get_callable(op, input_types, devtype, prefer_python=False):
             else:
                 raise RuntimeError("Tried to put Op %s on the GPU but I only have a python impl :("%op)
 
-
 def get_native_callable(op, input_types, devtype):
     nci = op.get_native_compile_info(input_types, devtype)
     nci.op_str = str(op)
@@ -711,6 +716,9 @@ def call_and_print(cmd):
     ctypes.c_float : "float"
 }
 
+
+_struct_cache = {} # because creating ctypes.Structure class is slow for some reason
+
 def _build_closure(triples):
     if triples is None:
         return ctypes.c_void_p(0)
@@ -719,8 +727,13 @@ def _build_closure(triples):
     for (fieldname,fieldtype,val) in triples:
         vals.append(val)
         fields.append((fieldname,fieldtype))
-    class S(ctypes.Structure):
-        _fields_ = fields
+    try:
+        key = cPickle.dumps(fields)
+        S = _struct_cache[key]
+    except KeyError:
+        class S(ctypes.Structure):
+            _fields_ = fields
+        _struct_cache[key] = S
     closure = S(*vals)    
     return closure
 
diff --git a/cgt/core.py b/cgt/core.py
index 24f8f70..a498689 100644
--- a/cgt/core.py
+++ b/cgt/core.py
@@ -2666,6 +2666,9 @@ def topsorted(outputs):
     return out
 
 def count_nodes(outputs):
+    """
+    Given an output node or a list of output nodes, return the number of nodes yielded by topsorted(outputs)
+    """
     if isinstance(outputs, Node): outputs = [outputs]
     return len(list(topsorted(outputs)))
 
diff --git a/cgt/optim.py b/cgt/optim.py
deleted file mode 100644
index ad51e7f..0000000
--- a/cgt/optim.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import numpy as np, logging
-from collections import deque
-import cgt
-import scipy.optimize as opt # maybe remove this dependency? only used for linesearch
-
-class InverseHessianPairs(object):
-    """
-    LBFGS (inverse) Hessian approximation based on rotating list of pairs (step, delta gradient)
-    that are assumed to approximately satisfy secant equation.
-    """
-    def __init__(self,max_num_pairs):
-        self.syrhos = deque([],max_num_pairs) #pylint: disable=E1121
-    def add(self,s,y):
-        rho = 1./y.dot(s)
-        self.syrhos.append((s,y,rho))
-        if rho < 0: print "WARNING: rho < 0"
-    def mvp(self,g):
-        """
-        Matrix-vector product        
-        Nocedal & Wright Algorithm 7.4
-        uses H0 = alpha*I, where alpha = <s,y>/<y,y>
-        """
-        assert len(self.syrhos) > 0
-
-        q = g.copy()
-        alphas = np.empty(len(self.syrhos))
-        for (i,(s,y,rho)) in reversed(list(enumerate(self.syrhos))):
-            alphas[i] = alpha = rho*s.dot(q)
-            q -= alpha*y
-
-        s,y,rho = self.syrhos[-1]
-        ydoty = y.dot(y)
-        sdoty = s.dot(y)
-        gamma = sdoty/ydoty
-
-        r = gamma*q
-
-        for (i,(s,y,rho)) in enumerate(self.syrhos):
-            beta = rho * y.dot(r)
-            r += s * (alphas[i] - beta)
-
-        return r
-
-    
-def lbfgs(f,fgrad,x0,maxiter=100,max_corr=25,grad_norm_tol=1e-9, ihp=None,ls_criteria="armijo"):
-    """
-    LBFGS algorithm as described by Nocedal & Wright in textbook Numerical Optimization
-    """
-    x = x0.copy()
-    yield x
-    if ihp is None: ihp = InverseHessianPairs(max_corr)
-    oldg = fgrad(x)
-    if ls_criteria=="armijo": fval = f(x)
-    p = -oldg/np.linalg.norm(oldg)
-
-    log = logging.getLogger("lbfgs")
-    iter_count = 0
-    while True:
-        # TODO compare line searches
-        g=None
-        if ls_criteria == "strong_wolfe":
-            alpha_star, _, _, fval, _, g = opt.line_search(f,fgrad,x,p,oldg)        
-        elif ls_criteria == "armijo":
-            import scipy.optimize.linesearch
-            alpha_star,_,fval=scipy.optimize.linesearch.line_search_armijo(f,x,p,oldg,fval)
-        else:
-            raise NotImplementedError
-
-        if alpha_star is None:
-            log.error("lbfgs line search failed!")
-            break
-        s = alpha_star * p
-        x += s
-        yield x
-
-        iter_count += 1
-        
-        if iter_count  >= maxiter:
-            break
-
-        if g is None: 
-            log.debug("line search didn't give us a gradient. calculating")
-            g = fgrad(x)
-
-        if np.linalg.norm(g) < grad_norm_tol:
-            break
-
-        y = g - oldg
-        ihp.add( s,y )
-        p = ihp.mvp(-g)
-        oldg = g
-
-        log.info("lbfgs iter %i %8.3e",iter_count, fval)
-
-
-def cg(f_Ax, b, cg_iters=10,callback=None,verbose=False,residual_tol=1e-10):
-    """
-    Demmel p 312
-    """
-    p = b.copy()
-    r = b.copy()
-    x = np.zeros_like(b)
-    rdotr = r.dot(r)
-
-    fmtstr =  "%10i %10.3g %10.3g"
-    titlestr =  "%10s %10s %10s"
-    if verbose: print titlestr%("iter","residual norm","soln norm")
-
-    for i in xrange(cg_iters):
-        if callback is not None:
-            callback(x)
-        if verbose: print fmtstr%(i, rdotr, np.linalg.norm(x))
-        z = f_Ax(p)
-        v = rdotr / p.dot(z)
-        x += v*p
-        r -= v*z
-        newrdotr = r.dot(r)
-        mu = newrdotr/rdotr
-        p = r + mu*p
-
-        rdotr=newrdotr
-        if rdotr < residual_tol:
-            break
-
-    if callback is not None:
-        callback(x)
-    if verbose: print fmtstr%(i+1, rdotr, np.linalg.norm(x)) #pylint: disable=W0631
-    return x
-
-def preconditioned_cg(f_Ax, f_Minvx, b, cg_iters=10,callback=None,verbose=False,residual_tol=1e-10):
-    """
-    Demmel p 318
-    """
-    x = np.zeros_like(b)
-    r = b.copy()
-    p = f_Minvx(b)
-    y = p
-    ydotr = y.dot(r)
-
-    fmtstr =  "%10i %10.3g %10.3g"
-    titlestr =  "%10s %10s %10s"
-    if verbose: print titlestr%("iter","residual norm","soln norm")
-
-    for i in xrange(cg_iters):
-        if callback is not None:
-            callback(x,f_Ax)
-        if verbose: print fmtstr%(i, ydotr, np.linalg.norm(x))
-        z = f_Ax(p)
-        v = ydotr / p.dot(z)
-        x += v*p
-        r -= v*z
-        y = f_Minvx(r)
-        newydotr = y.dot(r)
-        mu = newydotr/ydotr
-        p = y + mu*p
-
-        ydotr=newydotr
-
-        if ydotr < residual_tol:
-            break
-
-    if verbose: print fmtstr%(cg_iters, ydotr, np.linalg.norm(x))
-
-    return x
-
-
-
-def test_cg():
-    A = np.random.randn(5,5)    
-    A = A.T.dot(A)
-    b = np.random.randn(5)
-    x = cg(lambda x: A.dot(x), b, cg_iters=5,verbose=True) #pylint: disable=W0108
-    assert np.allclose(A.dot(x),b)
-
-
-    x = preconditioned_cg(lambda x: A.dot(x), lambda x: np.linalg.solve(A,x), b, cg_iters=5,verbose=True) #pylint: disable=W0108
-    assert np.allclose(A.dot(x),b)
-
-    x = preconditioned_cg(lambda x: A.dot(x), lambda x: x/np.diag(A), b, cg_iters=5,verbose=True) #pylint: disable=W0108
-    assert np.allclose(A.dot(x),b)
-
-
-
-def lanczos(f_Ax, b, k):
-    """
-    Runs Lanczos algorithm to generate a orthogonal basis for the Krylov subspace
-    b, Ab, A^2b, ...
-    as well as the upper hessenberg matrix T = Q^T A Q
-
-    from Demmel ch 6
-    """
-
-    assert k>1
-
-    alphas = []
-    betas = []
-    qs = []
-
-    q = b/np.linalg.norm(b)
-    beta = 0
-    qm = np.zeros_like(b)
-    for j in xrange(k):
-        qs.append(q)
-
-        z = f_Ax(q)
-
-        alpha = q.dot(z)
-        alphas.append(alpha)
-        z -= alpha*q + beta*qm
-
-        beta = np.linalg.norm(z)
-        betas.append(beta)
-
-        print "beta",beta
-        if beta < 1e-9:
-            print "lanczos: early after %i/%i dimensions"%(j+1,k)
-            break
-        else:
-            qm = q
-            q = z/beta
-
-
-    return np.array(qs,'float64').T, np.array(alphas,'float64'), np.array(betas[:-1],'float64')
-
-def lanczos2(f_Ax, b, k,residual_thresh=1e-9):
-    """
-    More numerically stable but less efficient version
-    """
-    b = b.astype('float64')
-    assert k>1
-    H = np.zeros((k,k))
-    qs = []
-
-    q = b/np.linalg.norm(b)
-    beta = 0
-
-    for j in xrange(k):
-        qs.append(q)
-
-        z = f_Ax(q.astype(cgt.floatX)).astype('float64')
-        for (i,q) in enumerate(qs):
-            H[j,i] = H[i,j] = h = q.dot(z)
-            z -= h*q
-
-        beta = np.linalg.norm(z)
-        if beta < residual_thresh:
-            print "lanczos2: stopping early after %i/%i dimensions residual %f < %f"%(j+1,k,beta,residual_thresh)
-            break
-        else:
-            q = z/beta
-            
-    return np.array(qs).T, H[:len(qs),:len(qs)]
-
-
-def make_tridiagonal(alphas,betas):
-    assert len(alphas)==len(betas)+1
-    N = alphas.size
-    out = np.zeros((N,N),cgt.floatX)
-    out.flat[0:N**2:N+1] = alphas
-    out.flat[1:N**2-N:N+1] = betas
-    out.flat[N:N**2-1:N+1] = betas
-    return out
-
-def tridiagonal_eigenvalues(alphas,betas):
-    T = make_tridiagonal(alphas,betas)
-    return np.linalg.eigvalsh(T)
-
-def test_lanczos():
-    np.set_printoptions(precision=4)
-
-    A = np.random.randn(5,5)    
-    A = A.T.dot(A)
-    b = np.random.randn(5)
-    f_Ax = lambda x:A.dot(x) #pylint: disable=W0108
-    Q,alphas,betas = lanczos(f_Ax,b,10)
-    H = make_tridiagonal(alphas,betas)
-    assert np.allclose( Q.T.dot(A).dot(Q), H)
-    assert np.allclose(Q.dot(H).dot(Q.T),A)
-    assert np.allclose(np.linalg.eigvalsh(H),np.linalg.eigvalsh(A))
-
-
-    Q,H1 = lanczos2(f_Ax, b, 10)
-    assert np.allclose(H,H1,atol=1e-6)
-
-
-    print "ritz eigvals:"
-    for i in xrange(1,6):
-        Qi = Q[:,:i]
-        Hi = Qi.T.dot(A).dot(Qi)
-        print np.linalg.eigvalsh(Hi)[::-1]
-    print "true eigvals:"
-    print np.linalg.eigvalsh(A)[::-1]
-
-    print "lanczos on ill-conditioned problem"
-    A = np.diag(10**np.arange(5))
-    Q,H1 = lanczos2(f_Ax, b, 10)
-    print np.linalg.eigvalsh(H1)
-
-    print "lanczos on ill-conditioned problem with noise"
-    def f_Ax_noisy(x):
-        return A.dot(x) + np.random.randn(x.size)*1e-3
-    Q,H1 = lanczos2(f_Ax_noisy, b, 10)
-    print np.linalg.eigvalsh(H1)
-
diff --git a/cgt/tests/test_affine.py b/cgt/tests/test_affine.py
index adabfd2..d0ccf88 100644
--- a/cgt/tests/test_affine.py
+++ b/cgt/tests/test_affine.py
@@ -67,10 +67,10 @@ def maybeprint(msg):
     for (g_cgt, g_true) in zip(grads_cgt, grads_true):
         np.testing.assert_allclose(g_cgt, g_true,rtol=rtol)
 
-    result_count = core.count_nodes(sy_result_simple)
-    grad_count = core.count_nodes(sy_grads_simple)
-    maybeprint("Result before: %i. after: %i"%(core.count_nodes([sy_result]), result_count))
-    maybeprint("Grad before: %i. after: %i"%(core.count_nodes(sy_grads), grad_count))
+    result_count = cgt.count_nodes(sy_result_simple)
+    grad_count = cgt.count_nodes(sy_grads_simple)
+    maybeprint("Result before: %i. after: %i"%(cgt.count_nodes([sy_result]), result_count))
+    maybeprint("Grad before: %i. after: %i"%(cgt.count_nodes(sy_grads), grad_count))
 
     PROB2RESULT[f.__name__] = {}    
     PROB2RESULT[f.__name__]["fn"] = result_count
@@ -241,6 +241,16 @@ def flip1(x, y):
         return (cgt.flip(x, [1])*y).sum()
 
 
+def negsli0(x,y):  # negative-step slice on axis 0; its check_affine call in check_affine_funcs is still commented out
+    return (x[::-1]*y).sum()
+
+def negsli1(x,y):  # negative-step slice on axis 1; its check_affine call in check_affine_funcs is still commented out
+    return (x[:, ::-1]*y).sum()
+
+def negsli01(x,y):  # negative-step slices on axes 0 and 1; its check_affine call in check_affine_funcs is still commented out
+    return (x[::-1, ::-1]*y).sum()
+
+
 def convlike(F_abcd, y_e_bcd, q_ae):
     a,b,c,d = F_abcd.shape
     F_a_bcd = F_abcd.reshape([a,b*c*d])
@@ -338,6 +348,10 @@ def check_affine_funcs(precision, backend):
     check_affine(flip0, M23, nr.randn(2,3))
     check_affine(flip1, M23, nr.randn(2,3))
 
+    # check_affine(negsli0, M23, nr.randn(2,3))
+    # check_affine(negsli1, M23, nr.randn(2,3))
+    # check_affine(negsli01, M23, nr.randn(2,3))
+
     # check_affine(rfft, M35)
     check_affine(convlike, T2357, nr.randn(11,3*5*7), nr.randn(2,11))
 
diff --git a/cgt/tests/test_examples.py b/cgt/tests/test_examples.py
index d0b240e..11031c6 100644
--- a/cgt/tests/test_examples.py
+++ b/cgt/tests/test_examples.py
@@ -12,6 +12,8 @@ def test_examples():
     yield run_example, "CGT_FLAGS=backend=python python %s/../../examples/demo_mnist.py --unittest"%thisdir
     yield run_example, "CGT_FLAGS=backend=native python %s/../../examples/demo_mnist.py --unittest"%thisdir
     yield run_example, "CGT_FLAGS=backend=native python %s/../../examples/demo_cifar.py --unittest"%thisdir
+    yield run_example, "cd %s/../../examples/ && CGT_FLAGS=backend=native python demo_char_rnn.py --unittest"%thisdir
+    yield run_example, "CGT_FLAGS=backend=native python %s/../../examples/demo_neural_turing_machine.py --unittest"%thisdir
     yield run_example, "python %s/../../examples/cgt_theano_feedforward_comparison.py --unittest"%thisdir
     runipycmd = "runipy %s/../../examples/tutorial.ipynb"%thisdir
     try:
diff --git a/cgt/utils.py b/cgt/utils.py
index 0af6ff7..bb38fc4 100644
--- a/cgt/utils.py
+++ b/cgt/utils.py
@@ -1,34 +1,40 @@
 import sys
 import numpy as np
 import hashlib
+import time
 
 # ================================================================
 # Utils
 # ================================================================
 
 class Color: #pylint: disable=W0232
-    GRAY=30,
-    RED=31,
-    GREEN=32,
-    YELLOW=33,
-    BLUE=34,
-    MAGENTA=35,
-    CYAN=36,
-    WHITE=37,
+    GRAY=30  # plain ints: the removed lines ended in a comma, which made each value a 1-tuple
+    RED=31
+    GREEN=32
+    YELLOW=33
+    BLUE=34
+    MAGENTA=35
+    CYAN=36
+    WHITE=37
     CRIMSON=38    
 
+
+def colorize(num, string, bold=False, highlight = False):  # return `string` wrapped in an ANSI escape sequence built from the integer code `num`
+    assert isinstance(num, int)
+    attr = []
+    if highlight: num += 10  # shifts the code by 10; for 30-37 this is presumably the matching background color -- confirm
+    attr.append(str(num))
+    if bold: attr.append('1')  # '1' becomes a second attribute, joined to the color code with ';' below
+    return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)  # the trailing \x1b[0m resets attributes after the text
+
 def colorprint(colorcode, text, o=sys.stdout):
-    o.write("\x1b[%im"%colorcode)
-    o.write(text)
-    o.write("\x1b[0m")
+    o.write(colorize(colorcode, text))  # writes the colored text to `o`; no trailing newline is added
 
 def warn(msg):
-    colorprint(Color.YELLOW, msg,o=sys.stderr)
-    sys.stderr.write("\n")
+    sys.stderr.write(colorize(Color.YELLOW, msg) + "\n")  # keep warnings on stderr (as before this change) so they do not mix into stdout output
 
 def error(msg):
-    colorprint(Color.RED, msg,o=sys.stderr)
-    sys.stderr.write("\n")
+    sys.stderr.write(colorize(Color.RED, msg) + "\n")  # keep errors on stderr (as before this change) so they do not mix into stdout output
 
 def is_singleton(x):
     return isinstance(x, np.ndarray) and np.prod(x.shape)==1
@@ -61,3 +67,17 @@ def _hash_seq(args):
 def hash_seq1(*args):
     return _hash_seq(args)
 
+MESSAGE_DEPTH = 0
+class Message(object):  # context manager: prints an indented message on entry and the elapsed time on exit
+    def __init__(self, msg):
+        self.msg = msg
+    def __enter__(self):  # NOTE(review): returns None, so `with Message(...) as m` binds m to None
+        global MESSAGE_DEPTH #pylint: disable=W0603
+        print colorize(Color.MAGENTA, '\t'*MESSAGE_DEPTH + '=: ' + self.msg)
+        self.tstart = time.time()
+        MESSAGE_DEPTH += 1  # nested Message blocks print one tab deeper
+    def __exit__(self, etype, *args):  # returns None, so an exception raised inside the block still propagates
+        global MESSAGE_DEPTH #pylint: disable=W0603
+        MESSAGE_DEPTH -= 1
+        maybe_exc = "" if etype is None else " (with exception)"
+        print colorize(Color.MAGENTA, '\t'*MESSAGE_DEPTH + "done%s in %.3f seconds"%(maybe_exc, time.time() - self.tstart))
diff --git a/cgtrc.example b/cgtrc.example
index 813bd0c..99b8a37 100644
--- a/cgtrc.example
+++ b/cgtrc.example
@@ -1,11 +1,14 @@
+# see cgtrc_spec.ini for explanation
+
+debug = False
 precision = single
-backend = "python"
-backend_check_values = False
-cache_dir = "~/.cgt_cache"
+backend = python
+cache_dir = ~/.cgt_cache
 enable_inplace_opt = True
 enable_simplification = True
-force_python_impl = False
-disallow_python_impl = False
 parallel = False
-enable_cuda = False
-debug_cpp = False
\ No newline at end of file
+num_threads = 4
+
+force_python_impl = False
+debug_cpp = False
+verbose = False
\ No newline at end of file
diff --git a/cgtrc_spec.ini b/cgtrc_spec.ini
index f67afdd..d7a4f67 100644
--- a/cgtrc_spec.ini
+++ b/cgtrc_spec.ini
@@ -1,15 +1,43 @@
 # DEVELOPERS: when you edit this file, please also edit cgtrc.example
-precision = string(default=single)
-backend = option("python","native",default="python")
-backend_check_values = boolean(default=False)
+
+# User options
+# ----------------------
+
+# At the cost of some overhead,
+# store information in the computation graph that helps with debugging
+debug = boolean(default=False)
+
+# single or double precision:
+precision = string(default=single) 
+
+# backend=python means using a pure python module to execute the graph, and using python implementations of ops whenever they exist
+# backend=native means using the compiled execution graph interpreter, and using the native (c++) implementation of ops
+backend = option("python","native",default="python") # "native" means using compiled implementations and the compiled execution graph interpreter
+
+# Where to put generated files
 cache_dir = string(default="~/.cgt_cache")
+
+# Enable in-place optimizations.
 enable_inplace_opt = boolean(default=True)
+
+# Enable simplifications of the graph, e.g. arithmetic simplifications like x*1=x
 enable_simplification = boolean(default=True)
-force_python_impl = boolean(default=False)
-disallow_python_impl = boolean(default=False)
+
+# Use parallel execution graph interpreter
 parallel = boolean(default=False)
+
+# Number of threads to use for parallel execution
 num_threads = integer(default=4)
-enable_cuda = boolean(default=False)
+
+# Developer Options
+# -----------------
+
+# Force native backend to use python
+force_python_impl = boolean(default=False)
+
+# Compile C++ files with debug flags
 debug_cpp = boolean(default=False) # use debug flags when compiling c++
-debug = boolean(default=False)
+
+# Print lots of diagnostic information
+# (we'll break this down at some point)
 verbose = boolean(default=False)
\ No newline at end of file
diff --git a/doc/index.rst b/doc/index.rst
index 594eebc..719347b 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -5,20 +5,16 @@ Computation Graph Toolkit
 Computation Graph Toolkit (CGT) is a library for evaluation and differentiation of functions of multidimensional arrays.
 
 
-.. CAUTION::
-
-    **WORK IN PROGRESS!** This software is not ready to release! The ends are loose, the edges are VERY rough, and most of the functionality required to make this software useful are not yet implemented.
-
 What Does It Do?
 ================
 
 The core features are as follows:
 
 - Automatic differentiation of functions involving multidimensional arrays, using computation graphs
-- Compile fast implementations of array computations that can be run in parallel on multiple CPUs and GPUs. [ONLY SINGLE-THREADED CPU USAGE IS CURRENTLY SUPPORTED.]
-- A compilation process that simplifies your function through arithmetic identities and in-place optimizations, which readily handles extremely large (1M operation) graphs.
+- Compile fast implementations of array computations that can be run in parallel on multiple CPUs and GPUs. (GPU and multi-GPU support is currently a work in progress.)
+- A compilation process that simplifies your function through arithmetic identities and in-place optimizations, which readily handles extremely large graphs.
 - Supports both forward and backward derivative propagation, as well as higher-order differentiation.
-- CGT can export standalone C++ code for of your functions. [PROSPECTIVE]
+- CGT can export standalone C++ code for your functions.
 
 CGT is motivated by large-scale machine learning and AI problems, however, the core library will focus on the more abstract problems of evaluating and differentiating mathematical expressions. This will ensure that CGT is flexible enough to handle use-cases that are completely unanticipated by the software authors. Libraries for numerical optimization and convenient construction of neural networks will be built on top of CGT’s core functionality.
 
@@ -41,8 +37,8 @@ Installation
 **Dependencies**:
 
 - NumPy
-- Cython (Optional)
-- CUDA (Optional)
+- Cython (optional, required for native backend)
+- CUDA Toolkit (optional, required for GPU implementation of Ops)
 
 
 Option 1: Python (NumPy) only
@@ -56,7 +52,7 @@ Just update your ``PYTHONPATH`` as follows::
 Option 2: Build C++ backend
 ---------------------------
 
-If you want to use the C++ backend, which has better performance and in the future will allow multithreading, then Cython is required, and the installation procedure is as follows.
+If you want to use the C++ backend, which has better performance and enables multithreading, then Cython is required, and the installation procedure is as follows.
 First, ``cd`` into the source directory. Then, type::
 
     mkdir build
@@ -89,7 +85,7 @@ Running unit tests
 
 You can run our suite of unit tests to verify your installation. In the source directory::
 
-    nosetests
+    nosetests -v
 
 Note that you'll have to install the ``nose`` python module.
 
@@ -98,23 +94,153 @@ Tutorial
 
 .. notebook:: ../examples/tutorial.ipynb
 
+nn: Neural Network Module
+=========================
+
+The ``nn`` module (``import cgt.nn``) provides a light wrapper around CGT's core API that allows the user to concisely build up complicated neural network models.
+Below we will show how to build up a convolutional neural network, and then how to parallelize it using the multi-threaded interpreter.
+A complete code listing can be found in ``examples/cgt_theano_feedforward_comparison.py``.
+
+
+A Simple ConvNet
+----------------
+
+The following code constructs a simple convolutional neural network (sized for MNIST images, what else?),
+where the loss function is the negative log-likelihood of labels.
+(A full listing is provided in the source directory, see ``examples/cgt_theano_feedforward_comparison.py``)
+
+.. code-block:: python
+
+    # X: a symbolic variable representing a batch of input images,
+    # with shape (batchsize, nchannels, nrows, ncols)
+    X = cgt.tensor4("X", fixed_shape=(None,1,28,28)) 
+    # We provide the fixed_shape argument so 
+    # CGT can infer the shape of downstream variables
+    # y: a symbolic variable representing the labels, which are integers
+    y = cgt.vector("y", dtype='i8')
+    # SpatialConvolution(...) is a constructor call, which builds the weights 
+    # (filter) and the biases for the convolutional layer
+    # rectify(...) is just a function call that maps x -> x*(x>0)
+    conv1 = nn.rectify(
+        nn.SpatialConvolution(1, 32, kernelshape=(3,3), pad=(0,0), 
+        weight_init=nn.IIDGaussian(std=.1))(X))
+    # Apply max pooling function
+    pool1 = nn.max_pool_2d(conv1, kernelshape=(3,3), stride=(2,2))
+    # Another convolutional layer
+    conv2 = nn.rectify(
+        nn.SpatialConvolution(32, 32, kernelshape=(3,3), pad=(0,0), 
+        weight_init=nn.IIDGaussian(std=.1))(pool1))
+    pool2 = nn.max_pool_2d(conv2, kernelshape=(3,3), stride=(2,2))
+    # Now we flatten the last output image
+    d0,d1,d2,d3 = pool2.shape
+    flatlayer = pool2.reshape([d0,d1*d2*d3])
+    # CGT can infer the shape of variables
+    nfeats = cgt.infer_shape(flatlayer)[1]
+    # One final fully-connected layer, and then a log-softmax
+    logprobs = nn.logsoftmax(nn.Affine(nfeats, 10)(flatlayer))
+    neglogliks = -logprobs[cgt.arange(X.shape[0]), y]
+    loss = neglogliks.mean()
+
+
+
+Now that we've built up an expression for the loss, we can build an expression for the gradient
+
+.. code-block:: python
+
+    # walk through the graph and find all parameters 
+    # i.e., variables constructed with cgt.shared(...)
+    params = nn.get_parameters(loss)
+    gparams = cgt.grad(loss, params)
+
+Finally, we can build up a function that updates the parameters:
+
+.. code-block:: python
+
+    updates = [(p, p-stepsize*gp) for (p, gp) in zip(params, gparams)]
+    updater = cgt.function([X, y, stepsize], loss, updates=updates)
+
+
+Adding parallelization
+----------------------
+
+CGT is capable of executing computations in parallel.
+A feedforward network does not offer much opportunity for parallelization, but we can easily transform it to use data parallelism.
+
+First let's build a ``Module``, which is a parameterized function.
+
+.. code-block:: python
+
+    m = nn.Module([X,y], [loss])
+
+The two arguments to the `Module` constructor are a list of inputs and a list of outputs, respectively.
+
+Now, we can split the data along the zeroth axis and apply the module `m` separately to each piece.
+
+.. code-block:: python
+
+    split_loss = 0
+    for start in xrange(0, batch_size, batch_size//4):
+        sli = slice(start, start+batch_size//4)
+        split_loss += m([X[sli], y[sli]])[0]
+    split_loss /= 4
+    params = nn.get_parameters(split_loss)
+    gparams = cgt.grad(split_loss, params)
+    updates2 = [(p, p-stepsize*gp) for (p, gp) in zip(params, gparams)]
+    updater =  cgt.function([X,y, stepsize], split_loss, updates=updates2)
+
+Configuration
+=============
+
+CGT sets a number of variables that you should be aware of.
+The file, ``cgtrc_spec.ini``, included below, provides a listing of the configuration variables.
+
+.. literalinclude:: ../cgtrc_spec.ini
+    :lines: 5-
+
+You can modify these values through the file ``~/.cgtrc`` or via the command line.
+See ``cgtrc.example`` in the source directory for a template.
 
-Tour of the Internals
-=====================
 
 Debugging
 =========
 
+Let's suppose you have compiled a ``function``, but it is failing at runtime or returning invalid results, and you're trying to figure out what might be wrong.
+Here are some steps you can take:
+
+1. Use the python backend. That is, modify ``.cgtrc`` or set ``CGT_FLAGS=backend=python`` at the command line. The python implementations of operations use numpy, which may catch certain errors (e.g. shape mismatches) that our C++ implementations miss.
+2. Set the configuration variable ``debug=True`` (with ``.cgtrc`` or ``CGT_FLAGS``.) That will enable several pieces of functionality that store information in the graph when you are building it, so useful information can be printed when an exception is reached.
+3. If there is a shape error, you can sometimes find it by using the ``infer_shape`` function. When constructing your variables, specify the known components of their shapes with the ``fixed_shape`` argument. Then you can add assertions with ``infer_shape``. (We plan to add more functionality soon that will catch shape errors at graph construction time.) Here's an example:
+   
+    .. code-block:: python
+
+        x = cgt.matrix(fixed_shape=(None,5))
+        y = cgt.matrix(fixed_shape=(5,4))
+        z = x.dot(y)
+        assert cgt.infer_shape(z)[1] == 4
+
+You can also take this a step further and compute numerical values associated with the nodes in the graph. Just replace ``x`` and ``y`` above with constants, and then use ``cgt.simplify`` to compute the value of ``z``.
 
+4. *Disable optimizations.* Set ``enable_simplification=False`` and ``enable_inplace_opt=False`` in ``.cgtrc``. Maybe there is a bug in cgt's optimization. If so, please :ref:`report it <BugsHelp>`.
 
 Cookbook
 ========
 
-See ``examples`` directory.
+See ``examples`` directory:
 
-Links and Further Reading
-=========================
+- ``demo_mnist.py``: shows how to build up a fully-connected or convolutional neural network  using a low-level API.
+- ``demo_cifar.py``: train a convolutional neural net on CIFAR dataset using ``nn``'s Torch-like API.
+- ``demo_char_rnn.py``: based on Andrej Karpathy's char-rnn code, but all in one file for building the deep LSTM or GRU model and generating text.
+- ``demo_neural_turing_machine.py``: implementation of the `Neural Turing Machine <http://arxiv.org/abs/1410.5401>`_, with feedforward controller.
+
+More examples are coming soon!
+
+.. _BugsHelp:
+
+Reporting Bugs and Getting Help
+===============================
 
+You can post your queries on the `cgt-users discussion group <https://groups.google.com/forum/#!forum/cgt-users>`_.
+If you want to participate in the development of CGT, post on the `cgt-devel discussion group <https://groups.google.com/forum/#!forum/cgt-devel>`_.
 
 
 .. _whynottheano:
@@ -125,11 +251,11 @@ Why not Build on Theano?
 CGT is heavily based on Theano, and we (the authors of CGT) think that Theano is a beautiful and highly innovative piece of software.
 However, several limitation of Theano (in its current state) motivated us to consider creating a new library:
 
-- **Problem**: Optimization and compilation of the graphs is very slow. For this region, Theano becomes  inconvenient when working with large recurrent models. To use these models, one has to use the Scan operator, which is usually less convenient than constructing a graph with the unrolled computation. |br| **CGT solution**: (1) the main graph simplification process in CGT involves a single pass through the graph that applies several different types of replacement simultaneously (common subexpression elimination, constant propagation, arithmetic identities like ``x*1=x``.) In-place optimizations are performed in a second phase that also involves a single pass through the graph. Together, these phases take negligible time. Furthermore, we use a different graph data-structure (similar to SSA representations used by compilers) which allows for much cleaner simplification code. In Theano, the C++/CUDA compilation itself takes significant time, because Theano compiles a whole Python module (written in C++) for each function, which includes Python.h and numpy/arrayobject.h. On the other hand, CGT compiles a small C++ file with minimal header dependencies, taking a small fraction of a second, and the relevant function is later retrieved with ``dlopen`` and ``dlsym``.
+- **Problem**: Optimization and compilation of the graphs is very slow. For this reason, Theano becomes  inconvenient when working with large recurrent models. To use these models, one has to use the Scan operator, which is usually less convenient than constructing a graph with the unrolled computation. |br| **CGT solution**: (1) the main graph simplification process in CGT involves a single pass through the graph that applies several different types of replacement simultaneously (common subexpression elimination, constant propagation, arithmetic identities like ``x*1=x``.) In-place optimizations are performed in a second phase that also involves a single pass through the graph. Together, these phases take negligible time. Furthermore, we use a different graph data-structure (similar to SSA representations used by compilers) which allows for much cleaner simplification code. In Theano, the C++/CUDA compilation itself takes significant time, because Theano compiles a whole Python module (written in C++) for each function, which includes Python.h and numpy/arrayobject.h. On the other hand, CGT compiles a small C++ file with minimal header dependencies, taking a small fraction of a second, and the relevant function is later retrieved with ``dlopen`` and ``dlsym``.
 - **Problem**: Theano can't straightforwardly be used to perform different operations in parallel, because of Python's GIL. |br| **CGT solution**: we create a representation of the computation called the execution graph, which can be executed in C++ independently of Python, and encodes all of the information necessary for concurrent execution of operations.
-- **Problem**: When using GPUs, the user often obtains poor performance unless he is careful to set up the graph in a way that the operations can be executed on the GPU. |br| **CGT solution**: we give the user finer grained control over which operation is performed on which device.
+- **Problem**: When using GPUs, the user often obtains poor performance unless the user is careful to set up the graph in a way that the operations can be executed on the GPU. |br| **CGT solution**: we give the user finer grained control over which operation is performed on which device.
 - **Problem**: Automatic upcasting rules (e.g. int * float = double) require the user to add casts many casting operations. |br| **CGT solution**: we globally choose either single or double (or quad) precision, using ``cgt.set_precision(...)``
-- It is difficult to debug certain bugs problems such as shape mismatches. Furthermore, Theano tensors have a `broadcastable` attribute that must be set to allow broadcasting and is point of confusion for many users. |br| **CGT solution**: we require explicit broadcasting using the ``broadcast(...)`` function. This requires slightly more verbosity but serves to eliminate many common errors and usually allows us to determine all of the shapes of intermediate variables in terms of the shapes of the inputs, which allows many shape errors to be caught at graph construction time.
+- **Problem**: It is difficult to debug certain problems such as shape mismatches. Furthermore, Theano tensors have a ``broadcastable`` attribute that must be set to allow broadcasting and is a point of confusion for many users. |br| **CGT solution**: we require explicit broadcasting using the ``broadcast(...)`` function. This requires slightly more verbosity but serves to eliminate many common errors and usually allows us to determine all of the shapes of intermediate variables in terms of the shapes of the inputs, which allows many shape errors to be caught at graph construction time.
 
 Some of issues could be addressed within Theano's existing codebase, however, we believe that by breaking compatibility and starting from afresh, it will be possible to resolve them more cleanly.
 
diff --git a/examples/bench/cgt_gru.py b/examples/bench/cgt_gru.py
new file mode 100644
index 0000000..b800293
--- /dev/null
+++ b/examples/bench/cgt_gru.py
@@ -0,0 +1,47 @@
+import cgt
+from gru import GRUCell
+import time
+from cgt.utils import Message
+import numpy as np
+
+if __name__ == "__main__":  # benchmark driver: unroll the GRU for --horizon steps, compile its gradient, call it 100 times
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--horizon",type=int)
+    args = parser.parse_args()
+    horizon = args.horizon
+    assert horizon is not None  # --horizon has no default, so it must be given on the command line
+    size=128
+    batchsize=64
+    cell = GRUCell([size],size)  # one input stream; input size and memory size are both `size`
+    X = cgt.tensor3()
+    init = cgt.matrix()
+
+    prev_h = init
+    for i in xrange(horizon):
+        prev_h = cell(prev_h, X[i])  # GRUCell.__call__(M, *inputs) takes the memory first, then the inputs
+    loss = prev_h.sum()
+
+    with Message("compiling"):
+        f = cgt.function([X, init],cgt.grad(loss, cell.params()))  # outputs: gradient of loss w.r.t. each cell parameter
+    with Message("running"):
+        xval = np.zeros((horizon,batchsize,size),cgt.floatX)
+        initval = np.zeros((batchsize, size), cgt.floatX)
+        for i in xrange(100): 
+            f(xval, initval)
+
+
+# # No speedup -- why?
+# with Message("split loss. compiling"):
+#     from cgt import nn
+#     m = cgt.nn.Module([X, init], [loss])
+#     split_loss = 0
+#     X1 = cgt.tensor3()
+#     init1 = cgt.matrix()
+#     for start in xrange(0, batchsize, batchsize//4):
+#         sli = slice(start, start+batchsize//4)
+#         split_loss += m([X1[:, sli], init1[sli]])[0]
+#     f = cgt.function([X1, init1],cgt.grad(split_loss, cell.params()))
+# with Message("running"):
+#     for i in xrange(100): 
+#         f(xval,initval)
diff --git a/examples/broken/gru.py b/examples/bench/gru.py
similarity index 92%
rename from examples/broken/gru.py
rename to examples/bench/gru.py
index e2faa7d..4ab979e 100644
--- a/examples/broken/gru.py
+++ b/examples/bench/gru.py
@@ -40,13 +40,14 @@ def __init__(self,input_sizes,mem_size,name_prefix=""):
 
     def __call__(self,M,*inputs):
         assert len(inputs) == len(self.Wizs)
-        summands = [Xi.dot(Wiz) for (Xi,Wiz) in zip(inputs,self.Wizs)] + [M.dot(self.Wmz),self.bz]
+        n = M.shape[0]
+        summands = [Xi.dot(Wiz) for (Xi,Wiz) in zip(inputs,self.Wizs)] + [M.dot(self.Wmz),cgt.repeat(self.bz,n, axis=0)]
         z = cgt.sigmoid(cgt.add_multi(summands))
 
-        summands = [Xi.dot(Wir) for (Xi,Wir) in zip(inputs,self.Wirs)] + [M.dot(self.Wmr),self.br]
+        summands = [Xi.dot(Wir) for (Xi,Wir) in zip(inputs,self.Wirs)] + [M.dot(self.Wmr),cgt.repeat(self.br,n, axis=0)]
         r = cgt.sigmoid(cgt.add_multi(summands))
 
-        summands = [Xi.dot(Wim) for (Xi,Wim) in zip(inputs,self.Wims)] + [(r*M).dot(self.Wmm),self.bm]
+        summands = [Xi.dot(Wim) for (Xi,Wim) in zip(inputs,self.Wims)] + [(r*M).dot(self.Wmm),cgt.repeat(self.bm,n, axis=0)]
         Mtarg = cgt.tanh(cgt.add_multi(summands)) #pylint: disable=E1111
 
         Mnew = (1-z)*M + z*Mtarg
diff --git a/examples/broken/seq_model.py b/examples/bench/seq_model.py
similarity index 93%
rename from examples/broken/seq_model.py
rename to examples/bench/seq_model.py
index f5339af..4469c59 100644
--- a/examples/broken/seq_model.py
+++ b/examples/bench/seq_model.py
@@ -3,7 +3,7 @@
 from time import time
 
 elapsed = []
-horizons = 2**np.arange(14)
+horizons = 2**np.arange(2, 10)
 
 for horizon in horizons:
     print "HORIZON",horizon
@@ -40,6 +40,6 @@
     elapsed.append(time()-tstart)
 
 import matplotlib.pyplot as plt
-plt.plot(horizons,elapsed)
+plt.plot(horizons,elapsed,'x-')
 plt.show()
 
diff --git a/examples/bench/theano_gru.py b/examples/bench/theano_gru.py
new file mode 100644
index 0000000..678e065
--- /dev/null
+++ b/examples/bench/theano_gru.py
@@ -0,0 +1,94 @@
+import theano, theano.tensor as TT
+from cgt.utils import Message
+import time
+import numpy as np
+
+def normc(x):  # divide a 2-D array by the Euclidean norm of each of its columns
+    assert x.ndim == 2
+    return x/norms(x,0)[None,:]  # [None,:] turns the per-column norms into a row so they broadcast over rows
+def randnf(*shp):  # standard-normal samples of shape `shp`, cast to theano.config.floatX
+    return np.random.randn(*shp).astype(theano.config.floatX)
+def norms(x,ax):  # Euclidean (L2) norm of `x` taken along axis `ax`
+    return np.sqrt(np.square(x).sum(axis=ax))
+
+class GRUCell(object):
+    """
+    Gated Recurrent Unit whose parameters are Theano shared variables. E.g., see
+    Chung, Junyoung, et al. "Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling." arXiv preprint arXiv:1412.3555 (2014).
+    """    
+    def __init__(self,input_sizes,mem_size,name_prefix=""):  # one input weight matrix per entry of input_sizes, for each of z, r and the candidate
+
+        Wiz_vals = [normc(randnf(input_size,mem_size)) for input_size in input_sizes]  # z gate: random init, columns scaled to unit norm
+        self.Wizs = [theano.shared(Wiz_val,name=name_prefix+"Wiz") for Wiz_val in Wiz_vals]
+        Wmz_val = normc(randnf(mem_size,mem_size))
+        self.Wmz = theano.shared(Wmz_val,name=name_prefix+"Wmz")
+        bz = np.zeros((1,mem_size),theano.config.floatX)  # bias stored as a single row, initialised to zero
+        self.bz = theano.shared(bz,name=name_prefix+"bz")
+        self.bz.type.broadcastable = (True,False)  # lets the 1-row bias broadcast over rows; NOTE(review): mutates the type in place -- theano.shared(..., broadcastable=...) may be cleaner, TODO confirm
+
+        Wir_vals = [normc(randnf(input_size,mem_size)) for input_size in input_sizes]  # r gate: same initialisation scheme as z
+        self.Wirs = [theano.shared(Wir_val,name=name_prefix+"Wir") for Wir_val in Wir_vals]
+        Wmr_val = normc(randnf(mem_size,mem_size))
+        self.Wmr = theano.shared(Wmr_val,name=name_prefix+"Wmr")
+        br = np.zeros((1,mem_size),theano.config.floatX)
+        self.br = theano.shared(br,name=name_prefix+"br")
+        self.br.type.broadcastable = (True,False)
+
+        Wim_vals = [normc(randnf(input_size,mem_size)) for input_size in input_sizes]  # candidate memory: input weights
+        self.Wims = [theano.shared(Wim_val,name=name_prefix+"Wim") for Wim_val in Wim_vals]
+        Wmm_val = normc(np.eye(mem_size,dtype=theano.config.floatX))  # memory-to-memory weights start at the identity
+        self.Wmm = theano.shared(Wmm_val,name=name_prefix+"Wmm")
+        bm = np.zeros((1,mem_size),theano.config.floatX)
+        self.bm = theano.shared(bm,name=name_prefix+"bm")
+        self.bm.type.broadcastable = (True,False)
+
+    def __call__(self,M,*inputs):  # M: current memory (first positional argument); inputs: one array per input weight matrix; returns the new memory
+        assert len(inputs) == len(self.Wizs)
+        summands = [Xi.dot(Wiz) for (Xi,Wiz) in zip(inputs,self.Wizs)] + [M.dot(self.Wmz),self.bz]
+        z = TT.nnet.sigmoid(TT.add(*summands))  # z weights the candidate against the old memory below
+
+        summands = [Xi.dot(Wir) for (Xi,Wir) in zip(inputs,self.Wirs)] + [M.dot(self.Wmr),self.br]
+        r = TT.nnet.sigmoid(TT.add(*summands))  # r scales the old memory before it enters the candidate
+
+        summands = [Xi.dot(Wim) for (Xi,Wim) in zip(inputs,self.Wims)] + [(r*M).dot(self.Wmm),self.bm]
+        Mtarg = TT.tanh(TT.add(*summands)) #pylint: disable=E1111
+
+        Mnew = (1-z)*M + z*Mtarg  # elementwise interpolation between old memory and candidate
+        return Mnew
+
+    def params(self):  # flat list of every shared parameter, in the order z, r, candidate
+        out = []
+        out.extend(self.Wizs)
+        out.append(self.Wmz)
+        out.append(self.bz)        
+        out.extend(self.Wirs)
+        out.append(self.Wmr)
+        out.append(self.br)        
+        out.extend(self.Wims)
+        out.append(self.Wmm)
+        out.append(self.bm)        
+        return out
+
+if __name__ == "__main__":  # benchmark driver: unroll the GRU for --horizon steps, compile its gradient, call it 100 times
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--horizon",type=int)
+    args = parser.parse_args()
+    horizon =args.horizon
+    assert horizon is not None  # --horizon has no default, so it must be given on the command line
+    size=128
+    batchsize=64
+    cell = GRUCell([size],size)  # one input stream; input size and memory size are both `size`
+    X = TT.tensor3()
+    init = TT.zeros((batchsize, size),theano.config.floatX)
+
+    prev_h = init
+    for i in xrange(horizon):
+        prev_h = cell(prev_h, X[i])  # GRUCell.__call__(M, *inputs) takes the memory first, then the inputs
+
+    with Message("compiling"):
+        f = theano.function([X],theano.grad(prev_h.sum(), cell.params()))  # outputs: gradient of the summed final memory w.r.t. each parameter
+    with Message("running"):
+        x = np.zeros((horizon,batchsize,size),theano.config.floatX)
+        for i in xrange(100): 
+            f(x)
\ No newline at end of file
diff --git a/examples/broken/seq_model2.py b/examples/broken/seq_model2.py
deleted file mode 100644
index ebfbfa0..0000000
--- a/examples/broken/seq_model2.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import gru,cgt, numpy as np
-import sys
-from time import time
-
-horizon = 3000
-batch_size = 6
-dim_x = 16
-mem_size = 10
-
-X_tnk = cgt.tensor3("X")
-
-cell = gru.GRUCell([dim_x], mem_size)
-
-# Minit_nk = cgt.zeros((X_tnk.shape[0], X_tnk.shape[1]),cgt.floatX)
-# M = Minit_nk
-
-Min = cgt.matrix("M_in")
-Xin = cgt.matrix("X_in")
-Mout = cell(Min, Xin)
-
-
-cellop = cgt.CallableComposition([Min, Xin], [Mout])
-
-
-M=Min
-for t in xrange(horizon):
-    M, = cellop(M, X_tnk[t])
-
-# cgt.print_tree(M)
-print "simplifying..."
-M_simp = cgt.simplify(M)
-print "done"
-# cgt.print_tree(M_simp)
-print "before:",cgt.count_nodes(M)
-print "after:",cgt.count_nodes(M_simp)
-
-# M = cgt.simplify(M)
diff --git a/examples/cgt_theano_feedforward_comparison.py b/examples/cgt_theano_feedforward_comparison.py
index 8dd64c2..459cf4c 100644
--- a/examples/cgt_theano_feedforward_comparison.py
+++ b/examples/cgt_theano_feedforward_comparison.py
@@ -147,7 +147,7 @@ def make_updater_fc_theano():
         return theano.function([X,y, stepsize], loss, updates=updates, allow_input_downcast=True)
 
 
-    if not have_theano and not args.unittest:
+    if have_theano and not args.unittest:
         updater_fc_theano = make_updater_fc_theano()
         print "Theano Fully-Connected Network"
         run_sgd_epochs(Xtrain, ytrain, updater_fc_theano)
@@ -247,7 +247,7 @@ def make_updater_convnet_theano():
         updates = [(p, p-stepsize*gp) for (p, gp) in zip(params, gparams)]
         return theano.function([X,y, stepsize], loss, updates=updates, allow_input_downcast=True)
 
-    if not have_theano and not args.unittest:
+    if False:#have_theano and not args.unittest:
 
         updater_convnet_theano = make_updater_convnet_theano()
         print "Theano Convnet"
diff --git a/examples/demo_char_rnn.py b/examples/demo_char_rnn.py
index 663d073..f9c88cb 100644
--- a/examples/demo_char_rnn.py
+++ b/examples/demo_char_rnn.py
@@ -133,15 +133,18 @@ def make_loss_and_grad_and_step(arch, size_input, size_output, size_mem, size_ba
 
     flatgrad = flatcat(gradloss)
 
-    f_loss_and_grad = cgt.function([x_tnk, targ_tnk] + init_hiddens, [loss, flatgrad] + final_hiddens)
+    with utils.Message("compiling loss+grad"):
+        f_loss_and_grad = cgt.function([x_tnk, targ_tnk] + init_hiddens, [loss, flatgrad] + final_hiddens)
     f_loss = cgt.function([x_tnk, targ_tnk] + init_hiddens, loss)
 
     assert len(init_hiddens) == len(final_hiddens)
 
     x_nk = cgt.matrix('x')
     outputs = network([x_nk] + init_hiddens)
+
     f_step = cgt.function([x_nk]+init_hiddens, outputs)
 
+    # print "node count", cgt.count_nodes(flatgrad)
     return network, f_loss, f_loss_and_grad, f_step
 
 
@@ -257,6 +260,7 @@ def main():
     parser.add_argument("--arch",choices=["lstm","gru"],default="lstm")
     parser.add_argument("--grad_check",action="store_true")
     parser.add_argument("--profile",action="store_true")
+    parser.add_argument("--unittest",action="store_true")
 
     args = parser.parse_args()
 
@@ -311,10 +315,11 @@ def f(thnew):
             rmsprop_update(grad, optim_state)
             pc.set_value_flat(optim_state.theta)
             losses.append(loss)
+            if args.unittest: return
         print "%.3f s/batch. avg loss = %.3f"%((time()-tstart)/len(losses), np.mean(losses))
-        optim_state.step_size *= .95 #pylint: disable=E1101
+        optim_state.step_size *= .98 #pylint: disable=E1101
 
-        sample(f_step, initialize_hiddens(1), char2ind =loader.char2ind, n_steps=300, temp=1.0, seed_text = "")
+        sample(f_step, initialize_hiddens(1), char2ind=loader.char2ind, n_steps=300, temp=1.0, seed_text = "")
 
     if args.profile: profiler.print_stats()
 
diff --git a/examples/demo_neural_turing_machine.py b/examples/demo_neural_turing_machine.py
index 39110f6..a4d2339 100644
--- a/examples/demo_neural_turing_machine.py
+++ b/examples/demo_neural_turing_machine.py
@@ -38,7 +38,7 @@
 from cgt.core import infer_shape
 from example_utils import fmt_row
 from param_collection import ParamCollection
-
+import time
 
 # Subscript indicate dimensions of array, and what each dimension indexes over
 NTMOpts = namedtuple("NTMOpts",[
@@ -241,6 +241,8 @@ def make_funcs(opt, ntm, total_time, loss_timesteps):
     f_loss = cgt.function([x_tbk, y_tbp], lossCE)
     f_loss_and_grad = cgt.function([x_tbk, y_tbp], [lossCE, loss01, flatgrad])
 
+    print "number of nodes in computation graph:", core.count_nodes([lossCE, loss01, flatgrad])
+
     return f_loss, f_loss_and_grad, params
 
 def round01(x):
@@ -357,10 +359,11 @@ def main():
         
 
 
+    tstart = time.time()
     ntm = make_ntm(opt)
-
     task = CopyTask(opt.b, seq_length, opt.p)
     f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, task.total_time(), task.loss_timesteps())
+    print "graph construction and compilation took %g seconds"%(time.time()-tstart)
 
     pc = ParamCollection(params)
     pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))