diff --git a/bidmach b/bidmach index db39c8be..9bc38a2c 100755 --- a/bidmach +++ b/bidmach @@ -2,7 +2,7 @@ # export JAVA_HOME="" # Set here if not set in environment # export CUDA_PATH="" # Set here if not set in environment -MEMSIZE="-Xmx14G" +MEMSIZE="-Xmx40G" export JAVA_OPTS="${MEMSIZE} -Xms128M -Dfile.encoding=UTF-8" # Set as much memory as possible BIDMACH_ROOT="${BASH_SOURCE[0]}" if [ ! `uname` = "Darwin" ]; then diff --git a/scripts/daniel_smf_netflix_adagrad.ssc b/scripts/daniel_smf_netflix_adagrad.ssc new file mode 100755 index 00000000..44a0d391 --- /dev/null +++ b/scripts/daniel_smf_netflix_adagrad.ssc @@ -0,0 +1,73 @@ +:silent +import BIDMach.models.SMF + +/** + * Test SMF code on netflix data. This will use default ADAGrad. In general, + * RMSEs of roughly 0.83 to 0.85 are "good". + */ + +// Same code as in the MHTest+ADAGrad script. +setseed(0) +val dir = "/data/netflix/" +val a = loadSMat(dir+"newtrain.smat.lz4") +val ta = loadSMat(dir+"newtest.smat.lz4") +val d = 256 +val lrates = row(0.0001, 0.001, 0.01, 0.1) +val langs = row(0.0, 0.05, 0.5) +var bestrmse = 10.0; +var prettystring = "lrate lang. rmse\n" + +for (i <- 0 until lrates.length) { + for (k <- 0 until langs.length) { + val (nn,opts) = SMF.learner1(a, d) + + // Common parameters with the MHTest+ADAGrad version. + opts.batchSize = 1000 + opts.npasses = 2 + opts.nesterov = null + opts.langevin = langs(k).v + opts.momentum = 1f + + opts.uiter = 5 + opts.urate = 0.05f + opts.lrate = lrates(i).v + val lambda = 4f + opts.lambdau = lambda + opts.regumean = lambda + opts.lambdam = lambda / 500000 * 20 + opts.regmmean = opts.lambdam + opts.evalStep = 31 + opts.doUsers = false + opts.what + nn.train + + val model = nn.model.asInstanceOf[SMF] + val xa = (ta != 0) + val (mm, mopts) = SMF.predictor1(model, a, xa) + mopts.batchSize = 10000 + mopts.uiter = 5 + mopts.urate = opts.urate + mopts.aopts = null + mm.predict + + val pa = SMat(mm.preds(1)); + println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) + val diff = ta.contents - pa.contents + val rmse = sqrt((diff ddot diff) / diff.length) + println("\nrmse = %f" format rmse.v) + min(pa.contents,5,pa.contents) + max(pa.contents,1,pa.contents) + val diff2 = ta.contents - pa.contents + val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) + println("rmse (w/clipping) = %f\n" format rmse2.v) + + if (rmse2.v < bestrmse) { + bestrmse = rmse2.v + } + prettystring += "%1.5f %1.3f %1.4f\n" format (lrates(i).v,langs(k).v,rmse2.v) + } +} + +println("\nBest RMSE: "+bestrmse+ "\n") +println(prettystring) +sys.exit diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc new file mode 100755 index 00000000..c87d281e --- /dev/null +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -0,0 +1,118 @@ +:silent +import BIDMach.models.SMF + +/** + * Test SMF code on netflix data. This will use OUR MHTest updater, which I put + * in as a new updater (SMF.learner2) to make this script more concise. 
Some + * notes on the netflix dataset: + * + * size(a) = (17770,480189) + * a.nnz = 90430138 + * min=0, max=5 + * + * (a == 1).nnz = 4156151 + * (a == 2).nnz = 9120198 + * (a == 3).nnz = 25928920 + * (a == 4).nnz = 30375037 + * (a == 5).nnz = 20849832 + * mean (of nonzeros) = 3.6042476 + * sqrt((diff ddot diff) / diff.nn) = 1.0852 // Train RMSE using mean predictor + * + * (ta == 1).nnz = 461839 + * (ta == 2).nnz = 1011882 + * (ta == 3).nnz = 2882327 + * (ta == 4).nnz = 3375921 + * (ta == 5).nnz = 2318400 + * mean (of nonzeros) = 3.6046705 + * sqrt((diff ddot diff) / diff.nn) = 1.0851 // Test RMSE using mean predictor + * + * BTW: (a *@ ta).nnz = 0, which shows that they are completely distinct. + */ + +// Same code as in the ADAGrad-only script. +setseed(0) +val dir = "/data/netflix/" +val a = loadSMat(dir+"newtrain.smat.lz4") +val ta = loadSMat(dir+"newtest.smat.lz4") +val d = 256 +val lrates = row(0.01) +val langs = sqrt(2f * lrates) // NEW! MALA so it's sqrt(2 * lr). +var bestrmse = 10.0; +var prettystring = "lrate lang. arate rmse\n" + +for (i <- 0 until lrates.length) { + for (k <- 0 until 1) { + val (nn,opts) = SMF.learner2(a, d) + + // Common parameters with the ADAGrad version. + opts.batchSize = 1000 + opts.npasses = 1 + opts.nesterov = null + opts.langevin = langs(k).v + opts.momentum = null + + opts.uiter = 5 + opts.urate = 0.05f + opts.lrate = lrates(i).v + val lambda = 4f + opts.lambdau = lambda + opts.regumean = lambda + opts.lambdam = lambda / 500000 * 20 + opts.regmmean = opts.lambdam + opts.evalStep = 31 + opts.doUsers = false + + // Now some stuff specific for the MHTest+ADAGrad. + opts.smf = true + opts.saveAcceptRate = true + opts.acceptRateDir = "tmp/" + opts.N = a.nnz + opts.temp = a.nnz / 1000 + opts.Nknown = true + opts.n2lsigma = 1.0f + opts.nn2l = 4000 + opts.sigmaProposer = 0.01f + opts.continueDespiteFull = false + opts.verboseMH = true + opts.collectData = false + opts.collectDataDir = "tmp/" + opts.exitTheta = false + opts.initThetaHere = true + opts.burnIn = -1 + opts.matrixOfScores = true + opts.what + nn.train + + val model = nn.model.asInstanceOf[SMF] + val xa = (ta != 0) + val (mm, mopts) = SMF.predictor1(model, a, xa) + mopts.batchSize = 10000 + mopts.uiter = 5 + mopts.urate = opts.urate + mopts.aopts = null + mm.predict + + val pa = SMat(mm.preds(1)); + println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) + val diff = ta.contents - pa.contents + val rmse = sqrt((diff ddot diff) / diff.length) + println("\nrmse = %f" format rmse.v) + min(pa.contents,5,pa.contents) + max(pa.contents,1,pa.contents) + val diff2 = ta.contents - pa.contents + val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) + println("rmse (w/clipping) = %f\n" format rmse2.v) + + val accepts = loadMat(opts.acceptRateDir+"arate_%1.4f_%1.3f.mat.lz4" format (lrates(i).v,langs(k).v)) + val arate = accepts.nnz / accepts.length.toFloat + + if (rmse2.v < bestrmse) { + bestrmse = rmse2.v + } + prettystring += "%1.5f %1.3f %1.4f %1.4f\n" format (lrates(i).v,langs(k).v,arate,rmse2.v) + } +} + +println("\nBest RMSE: "+bestrmse+ "\n") +println(prettystring) +sys.exit diff --git a/src/main/scala/BIDMach/Learner.scala b/src/main/scala/BIDMach/Learner.scala index 5648164d..e6b07e6a 100755 --- a/src/main/scala/BIDMach/Learner.scala +++ b/src/main/scala/BIDMach/Learner.scala @@ -137,7 +137,12 @@ case class Learner( if (mixins != null) mixins map (_ compute(mats, here)); if (updater != null) updater.update(ipass, here, gprogress); } - val scores = model.evalbatchg(mats, ipass, here); + + 
// Daniel: I needed to change the following line to the one after it: + // val scores = model.evalbatchg(mats, ipass, here); + val scores = mean(model.evalbatchg(mats, ipass, here)).v; + // in orer for the MH test to work with different-sized minibatches. + if (datasink != null) datasink.put; reslist.append(scores.newcopy) samplist.append(here) @@ -879,7 +884,6 @@ object Learner { def scores2FMat(reslist:ListBuffer[FMat]):FMat = { if (reslist.length == 0) return zeros(0, 0) - val out = FMat(reslist(0).nrows, reslist.length) var i = 0; while (i < reslist.length) { diff --git a/src/main/scala/BIDMach/models/SFA.scala b/src/main/scala/BIDMach/models/SFA.scala index ff7b65a0..055fbec4 100755 --- a/src/main/scala/BIDMach/models/SFA.scala +++ b/src/main/scala/BIDMach/models/SFA.scala @@ -216,6 +216,10 @@ class SFA(override val opts:SFA.Opts = new SFA.Options) extends FactorModel(opts Minv <-- inv(50f/nfeats*FMat(mm *^ mm) + opts.lambdau * diagM); } + /** + * The evalfun normally called during training. Returns -RMSE on training + * data minibatch (sdata). + */ def evalfun(sdata:Mat, user:Mat, ipass:Int, pos:Long):FMat = { val preds = DDS(mm, user, sdata) + (iavg + avg); if (ogmats != null) { @@ -230,6 +234,10 @@ class SFA(override val opts:SFA.Opts = new SFA.Options) extends FactorModel(opts -sqrt(row(vv/sdata.nnz)) } + /** + * The evalfun normally called during testing and predicting. Returns -RMSE + * on training data minibatch (sdata). + */ override def evalfun(sdata:Mat, user:Mat, preds:Mat, ipass:Int, pos:Long):FMat = { val spreds = DDS(mm, user, sdata) + (iavg + avg); val dc = sdata.contents; @@ -267,25 +275,6 @@ object SFA { } class Options extends Opts {} - def learner(mat0:Mat, d:Int) = { - class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with Grad.Opts - val opts = new xopts - opts.dim = d - opts.putBack = -1 - opts.npasses = 4 - opts.lrate = 0.1 - opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1) - val nn = new Learner( - new MatSource(Array(mat0:Mat), opts), - new SFA(opts), - null, - new Grad(opts), - null, - opts) - (nn, opts) - } - def learnerX(mat0:Mat, d:Int) = { class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with ADAGrad.Opts val opts = new xopts @@ -306,25 +295,6 @@ object SFA { (nn, opts) } - def learner(mat0:Mat, user0:Mat, d:Int) = { - class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with Grad.Opts - val opts = new xopts - opts.dim = d - opts.putBack = 1 - opts.npasses = 4 - opts.lrate = 0.1; - opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1) - val nn = new Learner( - new MatSource(Array(mat0, user0), opts), - new SFA(opts), - null, - new Grad(opts), - null, - opts) - (nn, opts) - } - def learnerX(mat0:Mat, user0:Mat, d:Int) = { class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with ADAGrad.Opts val opts = new xopts @@ -345,28 +315,9 @@ object SFA { (nn, opts) } - def learnerY(mat0:Mat, user0:Mat, d:Int) = { - class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with ADAGrad.Opts - val opts = new xopts - opts.dim = d - opts.putBack = 1 - opts.npasses = 4 - opts.lrate = 0.1; - opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1) - val nn = new Learner( - new MatSource(Array(mat0, user0), opts), - new SFA(opts), - null, - new ADAGrad(opts), - null, - opts) - (nn, opts) - } - - class PredOpts extends Learner.Options with SFA.Opts with MatSource.Opts with MatSink.Opts + class PredOpts extends 
Learner.Options with SFA.Opts with MatSource.Opts with MatSink.Opts - def predictor(model0:Model, mat1:Mat, preds:Mat) = { + def predictor(model0:Model, mat1:Mat, preds:Mat) = { val model = model0.asInstanceOf[SFA] val nopts = new PredOpts; nopts.batchSize = math.min(10000, mat1.ncols/30 + 1) diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index 4d8657e5..7d9a38d6 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -5,6 +5,7 @@ import BIDMat.MatFunctions._ import BIDMat.SciFunctions._ import BIDMat.Solvers._ import BIDMach.datasources._ +import BIDMach.datasinks._ import BIDMach.updaters._ import BIDMach.Learner @@ -71,35 +72,48 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts var epsilon = 0f; var aopts:ADAGrad.Opts = null; + // Daniel: doing this to set MB sizes in MHTest code. + var numNonzerosMB:Int = -1 override def init() = { + // Get dimensions; for Netflix, size(mats(0)) = (17770,batchSize). mats = datasource.next; - datasource.reset; - nfeats = mats(0).nrows; - val batchSize = mats(0).ncols; + datasource.reset; + nfeats = mats(0).nrows; + val batchSize = mats(0).ncols; + numNonzerosMB = mats(0).nnz val d = opts.dim; + if (refresh) { - mm = normrnd(0,0.01f,d,nfeats); - mm = convertMat(mm); - avg = mm.zeros(1,1) - iavg = mm.zeros(nfeats,1); - itemsum = mm.zeros(nfeats, 1); - itemcount = mm.zeros(nfeats, 1); - setmodelmats(Array(mm, iavg, avg)); + // Randomly drawing mm, iavg, and avg (the three respective model + // matrices). Note that nfeats is the number of items (e.g. movies). + println("Inside refresh") + mm = normrnd(0,0.01f,d,nfeats); + mm = convertMat(mm); + avg = mm.zeros(1,1) + iavg = mm.zeros(nfeats,1); + itemsum = mm.zeros(nfeats, 1); + itemcount = mm.zeros(nfeats, 1); + setmodelmats(Array(mm, iavg, avg)); } + + // Handle brief logic with GPUs. Careful with aliasing as well!! useGPU = opts.useGPU && Mat.hasCUDA > 0; - if (useGPU || useDouble) { - gmats = new Array[Mat](mats.length); - } else { - gmats = mats; - } - - modelmats(0) = convertMat(modelmats(0)); - modelmats(1) = convertMat(modelmats(1)); - modelmats(2) = convertMat(modelmats(2)); - mm = modelmats(0); + if (useGPU || useDouble) { + gmats = new Array[Mat](mats.length); + } else { + gmats = mats; + } + modelmats(0) = convertMat(modelmats(0)); + modelmats(1) = convertMat(modelmats(1)); + modelmats(2) = convertMat(modelmats(2)); + mm = modelmats(0); iavg = modelmats(1); avg = modelmats(2); + + // Here's some confusing stuff. Seems to be "small" stuff about constants. + // uscale, an internal ADAGrad parameter but we use it (!!!). + // cscale, an internal ADAGrad parameter but we ignore it. lamu = mm.ones(d, 1) ∘ opts.lambdau if (opts.doUsers) lamu(0) = opts.regumean; slm = mm.ones(1,1) ∘ (opts.lambdam * batchSize); @@ -109,12 +123,18 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts cscale = mm.ones(d, 1); cscale(0,0) = 0.0001f; if (opts.doUsers) mm(0,?) = 1f + + // The updatemats is the same length as the model matrices. updatemats = new Array[Mat](3); updatemats(2) = mm.zeros(1,1); + + // Set this to null to avoid the internal ADAGrad updater making updates. if (opts.aopts != null) initADAGrad(d, nfeats); - vexp = convertMat(row(0.5f)); + vexp = convertMat(row(0.5f)); // External ADAGrad parameter, OK here. } - + + + /** An internal ADAGrad updater. Ignore this for our current experiments. 
*/
  def initADAGrad(d:Int, m:Int) = {
    aopts = opts.asInstanceOf[ADAGrad.Opts]
    firststep = -1f;
@@ -127,141 +147,241 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts
     waitsteps = aopts.waitsteps;
     epsilon = aopts.epsilon;
   }
-  
+
+
+  /**
+   * Performs some number of passes over the minibatch to update the user
+   * matrix. Try to understand how the user matrix gets updated ... note that
+   * putBack = -1 by default. I think this is the user matrix update, so we're
+   * holding the item matrix fixed (it's actually the model matrix, but the same
+   * point holds) while updating the user by stochastic gradient descent.
+   *
+   * We subtract biases, so predictions can be done with DDS(mm,user,sdata)
+   * *without* re-adding biases. Also, we *do* clear out user here. Is this
+   * because John said we can't really save the entire *full* user matrix (the
+   * one with size (dim,480189))? ucounts sums up the number of nonzeros in
+   * each column of sdata0, and uci = (ucounts + 1) ^ (-vexp) rescales each
+   * column's step. The b = mm * sdata term shows up in the derivative later as
+   * b - mm*preds = mm*(sdata - preds), which shrinks as the predictions
+   * approach the data.
+   *
+   * We then update the user matrix several times based on current predictions.
+   * Actually, this update makes sense because the normal SGD update adds the
+   * following term to x_u (the user vectors):
+   *
+   * alpha*( (data - prediction)*item_vector - lambda*user_vector )
+   *
+   * and that's what we have here! QUESTION, though, uscale is an integrated
+   * ADAGrad value. Do we want it here? I'm also not sure why we need du to
+   * have uscale and uci there ...
+   *
+   * NOTE: Upon further inspection, it seems that `user` starts out as a matrix
+   * of all zeros. So the user.clear with putBack<0 is unnecessary as it is
+   * already cleared. I suppose in theory we should have some putBack mechanism
+   * (that way, the user matrix value is stored from prior iterations) but John
+   * said there's little reason to do that. Also, even with putBack=1, I can't
+   * get the user matrix's values carried over. Hmmm ...
+   *
+   * @param sdata0 Training data minibatch of size (nitems, batchSize).
+   * @param user Second matrix for computing predictions, of size (dim, batchSize).
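+   *
+   * Spelled out against the code below, each of the opts.uiter inner passes
+   * computes (with preds = DDS(mm, user, sdata)):
+   *
+   *   du   = ( mm*sdata - mm*preds - lamu ∘ user ) ∘ uscale ∘ uci
+   *   user <- user + du
+   *
+   * i.e. the SGD step described above, applied column by column with step size
+   * uscale and per-column rescaling uci.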
+ */ def uupdate(sdata0:Mat, user:Mat, ipass:Int, pos:Long):Unit = { - if (firststep <= 0) firststep = pos.toFloat; - val step = (pos + firststep)/firststep; - val texp = if (opts.asInstanceOf[Grad.Opts].texp.asInstanceOf[AnyRef] != null) { - opts.asInstanceOf[Grad.Opts].texp.dv - } else { - opts.asInstanceOf[Grad.Opts].pexp.dv - } - uscale.set(opts.urate * math.pow(ipass+1, - texp).toFloat) + if (firststep <= 0) firststep = pos.toFloat; + val step = (pos + firststep)/firststep; + val texp = if (opts.asInstanceOf[Grad.Opts].texp.asInstanceOf[AnyRef] != null) { + opts.asInstanceOf[Grad.Opts].texp.dv + } else { + opts.asInstanceOf[Grad.Opts].pexp.dv + } + + uscale.set(opts.urate * math.pow(ipass+1, - texp).toFloat) val sdata = sdata0 - (iavg + avg); - if (putBack < 0) { - user.clear - } - val b = mm * sdata; - val ucounts = sum(sdata0 != 0f); - val uci = (ucounts + 1f) ^ (- vexp); - for (i <- 0 until opts.uiter) { - val preds = DDS(mm, user, sdata); - val deriv = b - mm * preds - (user ∘ lamu); - val du = (deriv ∘ uscale ∘ uci); - if (opts.lsgd >= 0) { - val dpreds = DDS(mm, du, sdata); - accept(sdata, user, du, preds, dpreds, uscale, lamu, false); - } else { - user ~ user + du; - } - - if (opts.traceConverge) { - println("step %d, loss %f" format (i, ((norm(sdata.contents - preds.contents) ^ 2f) + (sum(user dot (user ∘ lamu)))).dv/sdata.nnz)); - } - } + if (putBack < 0) { + user.clear + } + val b = mm * sdata; + val ucounts = sum(sdata0 != 0f); + val uci = (ucounts + 1f) ^ (- vexp); + + for (i <- 0 until opts.uiter) { + val preds = DDS(mm, user, sdata); + val deriv = b - mm * preds - (user ∘ lamu); + val du = (deriv ∘ uscale ∘ uci); + user ~ user + du; + if (opts.traceConverge) { + println("step %d, loss %f" format (i, ((norm(sdata.contents - preds.contents) ^ 2f) + (sum(user dot (user ∘ lamu)))).dv/sdata.nnz)); + } + } } - + + + /** + * Computes updates to the updatemats. Note again that we subtract (iavg+avg) + * from the sdata, so that predictions are done with DDS(mm,user,sdata), and + * the differences (for gradient update later) are stored. This is for + * updating the item matrix, so we hold the user matrix fixed (it's updated in + * uupdate) and compute updates for the item matrix (and the bias terms, + * actually). Also, this might be why we don't use a user bias term. Note that + * we call predictions once here, since mm and user are fixed for this method; + * ideally, some other updater will use the updatemats we compute (i.e. the + * gradients) to update the item matrix. The item matrix, if it wasn't clear, + * is modelmats(0). + * + * Note that there's some extra work with ipass < 1, I think to get reasonable + * initialization values for our bias terms. Here, avg is the average rating + * across the entire nonzeros of sdata0 (hence, our global bias), and iavg is + * some (scaled) estimate of how much we should boost each individal item. The + * iavg should be smaller since the avg already scales stuff to roughly 3.6. + * + * During predictions, this method is NOT called, hence the biases aren't + * updated. + * + * @param sdata0 Training data minibatch of size (nitems, batchSize). + * @param user Second matrix for computing predictions, of size (dim, + * batchSize). The matrix has the same values as the user matrix updated + * from the most recent uupdate method call. 
+ */ def mupdate(sdata0:Mat, user:Mat, ipass:Int, pos:Long):Unit = { val sdata = sdata0 - (iavg + avg); // values to be accumulated val preds = DDS(mm, user, sdata); - val diffs = sdata + 2f; + val diffs = sdata + 2f; // I THINK 2f is only for avoiding aliasing, but why not 0f? diffs.contents ~ sdata.contents - preds.contents; + if (ipass < 1) { - itemsum ~ itemsum + sum(sdata0, 2); - itemcount ~ itemcount + sum(sdata0 != 0f, 2); - avg ~ sum(itemsum) / sum(itemcount); - iavg ~ ((itemsum + avg) / (itemcount + 1)) - avg; + itemsum ~ itemsum + sum(sdata0, 2); // sum horizontally + itemcount ~ itemcount + sum(sdata0 != 0f, 2); // count #nonzeros horizontally + avg ~ sum(itemsum) / sum(itemcount); + iavg ~ ((itemsum + avg) / (itemcount + 1)) - avg; } + + // Compute gradient updates for the biases, and set wuser=user unless we're weighing. val icomp = sdata0 != 0f val icount = sum(sdata0 != 0f, 2); updatemats(1) = (sum(diffs,2) - iavg*mlm) / (icount + 1f); // per-item term estimator updatemats(2) ~ sum(diffs.contents) / (diffs.contents.length + 1f); val wuser = if (opts.weightByUser) { - val iwt = 100f / max(sum(sdata != 0f), 100f); + val iwt = 100f / max(sum(sdata != 0f), 100f); user ∘ iwt; } else { user; } if (firststep <= 0) firststep = pos.toFloat; + + // I get it! This derivative is virtually the same as what we had with the + // user update, except user and mm swap locations, which is expected. if (opts.lsgd >= 0 || opts.aopts == null) { - updatemats(0) = (wuser *^ diffs - (mm ∘ slm)) / ((icount + 1).t ^ vexp); // simple derivative - if (opts.lsgd >= 0) { - val step = (pos + firststep)/firststep; - uscale.set((lrate.dv * math.pow(step, - texp.dv)).toFloat); - val dm = updatemats(0) ∘ uscale ∘ cscale; - val dpreds = DDS(dm, user, sdata); - accept(sdata, mm, dm, preds, dpreds, uscale, slm, true); - } + updatemats(0) = (wuser *^ diffs - (mm ∘ slm)) / ((icount + 1).t ^ vexp); // simple derivative } else { - if (texp.asInstanceOf[AnyRef] != null) { - val step = (pos + firststep)/firststep; - ADAGrad.multUpdate(wuser, diffs, modelmats(0), sumsq, null, lrate, texp, vexp, epsilon, step, waitsteps); - } else { - ADAGrad.multUpdate(wuser, diffs, modelmats(0), sumsq, null, lrate, pexp, vexp, epsilon, ipass + 1, waitsteps); - } + if (texp.asInstanceOf[AnyRef] != null) { + val step = (pos + firststep)/firststep; + ADAGrad.multUpdate(wuser, diffs, modelmats(0), sumsq, null, lrate, texp, vexp, epsilon, step, waitsteps); + } else { + ADAGrad.multUpdate(wuser, diffs, modelmats(0), sumsq, null, lrate, pexp, vexp, epsilon, ipass + 1, waitsteps); + } } if (opts.doUsers) mm(0,?) 
= 1f; } - def accept(sdata:Mat, mmod:Mat, du:Mat, preds:Mat, dpreds:Mat, scale:Mat, lambda:Mat, flip:Boolean) = { - // println("sdata " + FMat(sdata.contents)(0->5,0).t) - val diff1 = preds + 0f; - diff1.contents ~ sdata.contents - preds.contents; -// println("sdata %d %s" format (if (flip) 1 else 0, FMat(sdata.contents)(0->5,0).t.toString)); -// println("preds %d %s" format (if (flip) 1 else 0, FMat(preds.contents)(0->5,0).t.toString)); -// println("diff %d %s" format (if (flip) 1 else 0, FMat(diff1.contents)(0->5,0).t.toString)); -// println("sdata "+FMat(sdata.contents)(0->5,0).t.toString); - val diff2 = diff1 + 0f; - diff2.contents ~ diff1.contents - dpreds.contents; - diff1.contents ~ diff1.contents ∘ diff1.contents; - diff2.contents ~ diff2.contents ∘ diff2.contents; - val rmmod = mmod + 1f; - normrnd(0, opts.lsgd, rmmod); - val mmod2 = mmod + du + rmmod ∘ scale; - val loss1 = (if (flip) sum(diff1,2).t else sum(diff1)) + (mmod dot (mmod ∘ lambda)); - val loss2 = (if (flip) sum(diff2,2).t else sum(diff2)) + (mmod2 dot (mmod2 ∘ lambda)); - - val accprob = erfc((loss2 - loss1) /scale); - val rsel = accprob + 0f; - rand(rsel); - val selector = rsel < accprob; - mmod ~ (mmod2 ∘ selector) + (mmod ∘ (1f - selector)); - if (opts.traceConverge) { - println("accepted %d %f %f %f" format (if (flip) 1 else 0, mean(selector).dv, mean(loss1).dv, mean(loss2).dv)); - } + + /** + * The evalfun normally called during training. Returns -RMSE on training data + * minibatch (sdata0). It has an extra option to return a matrix of scores, + * which will be useful for minibatch MH test updaters. We need a 1/(2*sigma^2) + * if we're assuming a Gaussian error distribution. + * + * @param sdata Training data minibatch of size (nitems, batchSize). + * @param user Second matrix for computing predictions, of size (dim, + * batchSize). The values here are based on the values computed in the most + * recent uupdate call. + */ + def evalfun(sdata:Mat, user:Mat, ipass:Int, pos:Long):FMat = { + val preds = DDS(mm, user, sdata) + (iavg + avg); + if (ogmats != null) { + ogmats(0) = user; + if (ogmats.length > 1) { + ogmats(1) = preds; + } + } + val dc = sdata.contents + val pc = preds.contents + val diff = DMat(dc - pc); + if (opts.matrixOfScores) { + // TODO Temporary but should be OK for now (b/c we almost never increment MB). + // The FMat(diff *@ diff) will make a vector, hence the broadcasting. + // Also, I was getting a handful of *really* negative scores where log + // p(.) went to -10000 or so. To prevent that, set a threshold at -15. + //println(sqrt((diff ddot diff)/diff.length)) // Use for debugging and sanity checks. + val sigma_sq = variance(diff).dv + val scores = -ln(sqrt(2*math.Pi*sigma_sq)).v - (1.0f/(2*sigma_sq)).v * FMat(diff *@ diff) + max(scores, -15f, scores) + scores + } else { + val vv = diff ddot diff; + -sqrt(row(vv/sdata.nnz)) + } + } + + + /** + * The evalfun normally called during TESTING (i.e. PREDICTION). Returns -RMSE + * on the TRAINING minibatch in `sdata`. We should also store predictions in + * `ogmats(1)`, which is what we can access externally via `preds(1)`. Thus, + * it's predicting based on both training and testing. + * + * @param sdata Training data minibatch of size (nitems, batchSize). + * @param user Second matrix for computing predictions, size (dim, batchSize). + * @param preds Matrix indicating the non-zero TESTING data points. 
+ */ + override def evalfun(sdata:Mat, user:Mat, preds:Mat, ipass:Int, pos:Long):FMat = { + val spreds = DDS(mm, user, sdata) + (iavg + avg); + val xpreds = DDS(mm, user, preds) + (iavg + avg); + val dc = sdata.contents; + val pc = spreds.contents; + val vv = (dc - pc) ddot (dc - pc); + + println("mean values (train/t.pred): \t%1.4f\t%1.4f" format (FMat(mean(dc)).v,FMat(mean(pc)).v)) + println("std. values (train/t.pred): \t%1.4f\t%1.4f" format (FMat(sqrt(variance(dc))).v,FMat(sqrt(variance(pc))).v)) + println("max. values (train/t.pred): \t%1.4f\t%1.4f" format (FMat(maxi(dc)).v,FMat(maxi(pc)).v)) + println("min. values (train/t.pred): \t%1.4f\t%1.4f" format (FMat(mini(dc)).v,FMat(mini(pc)).v)) + + if (ogmats != null) { + ogmats(0) = user; + if (ogmats.length > 1) { + ogmats(1) = xpreds; + } + } + -sqrt(row(vv/sdata.nnz)) } - def evalfun(sdata0:Mat, user:Mat, ipass:Int, pos:Long):FMat = { - val sdata = sdata0 - (iavg + avg); - val preds = DDS(mm, user, sdata); - val dc = sdata.contents - val pc = preds.contents - val diff = dc - pc; - val vv = diff ddot diff; - -sqrt(row(vv/sdata.nnz)) + + /** So I can set the MHTest container size appropriately. */ + def getNonzeros():Int = { + return numNonzerosMB } + + } + object SMF { trait Opts extends FactorModel.Opts { var ueps = 1e-10f var uconvg = 1e-3f - var miter = 5 var lambdau = 5f var lambdam = 5f var regumean = 0f var regmmean = 0f var urate = 0.1f - var lsgd = 0.1f var traceConverge = false var doUsers = true var weightByUser = false var aopts:ADAGrad.Opts = null; var minv = 1f; var maxv = 5f; - + var matrixOfScores = false; + var lsgd = 0f; } + class Options extends Opts {} def learner(mat0:Mat, d:Int) = { @@ -282,67 +402,78 @@ object SMF { opts) (nn, opts) } - - def learnerX(mat0:Mat, d:Int) = { + + /** + * Learner with single (training data) matrix as datasource, and an + * **EXTERNAL** ADAGrad Opts. We will benchmark this with the learner using + * an external MHTest. No internal ADAGrad updater. + */ + def learner1(mat0:Mat, d:Int) = { class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with ADAGrad.Opts val opts = new xopts - opts.dim = d + opts.dim = d opts.putBack = -1 - opts.npasses = 4 - opts.lrate = 0.1; - opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1); - opts.aopts = opts; - val nn = new Learner( - new MatSource(Array(mat0:Mat), opts), - new SMF(opts), - null, - null, - null, - opts); + opts.npasses = 4 + opts.lrate = 0.1 + opts.initUval = 0f; + opts.batchSize = math.min(100000, mat0.ncols/30 + 1) + opts.aopts = null + val nn = new Learner( + new MatSource(Array(mat0:Mat), opts), + new SMF(opts), + null, + new ADAGrad(opts), + null, + opts) (nn, opts) } - - def learner(mat0:Mat, user0:Mat, d:Int) = { - class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with Grad.Opts + + /** + * Learner with single (training data) matrix as datasource, and using our + * MHTest updater. Use this for running experiments to benchmark with default + * ADAGrad. For our experiments, we should NOT be using opts.aopts, which is + * the internal ADAGrad updater. So that should be null ... 
+ */ + def learner2(mat0:Mat, d:Int) = { + class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with ADAGrad.Opts with MHTest.Opts val opts = new xopts - opts.dim = d - opts.putBack = 1 - opts.npasses = 4 - opts.lrate = 0.1; - opts.initUval = 0f; + opts.dim = d + opts.putBack = -1 + opts.npasses = 4 + opts.lrate = 0.1 + opts.initUval = 0f; opts.batchSize = math.min(100000, mat0.ncols/30 + 1) + opts.aopts = null val nn = new Learner( - new MatSource(Array(mat0, user0), opts), - new SMF(opts), - null, - new Grad(opts), - null, - opts) + new MatSource(Array(mat0:Mat), opts), + new SMF(opts), + null, + new MHTest(opts), + null, + opts) (nn, opts) - } - - def learnerX(mat0:Mat, user0:Mat, d:Int) = { - class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with ADAGrad.Opts + } + + def learner(mat0:Mat, user0:Mat, d:Int) = { + class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with Grad.Opts val opts = new xopts opts.dim = d opts.putBack = 1 opts.npasses = 4 opts.lrate = 0.1; opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1); - opts.aopts = opts; + opts.batchSize = math.min(100000, mat0.ncols/30 + 1) val nn = new Learner( new MatSource(Array(mat0, user0), opts), new SMF(opts), null, - null, + new Grad(opts), null, opts) (nn, opts) } - def predictor(model0:Model, mat1:Mat, preds:Mat) = { + def predictor(model0:Model, mat1:Mat, preds:Mat) = { class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with Grad.Opts val model = model0.asInstanceOf[SMF] val nopts = new xopts; @@ -354,7 +485,6 @@ object SMF { val mopts = model.opts.asInstanceOf[SMF.Opts]; nopts.dim = mopts.dim; nopts.uconvg = mopts.uconvg; - nopts.miter = mopts.miter; nopts.lambdau = mopts.lambdau; nopts.lambdam = mopts.lambdam; nopts.regumean = mopts.regumean; @@ -369,6 +499,46 @@ object SMF { nopts) (nn, nopts) } -} - + + /** A class for one of the SMF predictors. */ + class PredOpts extends Learner.Options with SMF.Opts with MatSource.Opts with MatSink.Opts with Grad.Opts with ADAGrad.Opts + /** + * A predictor which will store the predictions in the predictor model + * matrices. It forms an empty matrix to be populated by the `user` matrices, + * which turns into the second factor matrix. It mirrors an SFA predictor code + * which also forms this empty matrix into the matrix datasource, with the + * only difference being the lack of an Minv option for `newmod`. + * + * @param mat1 The TRAINING DATA matrix. NOT THE TESTING DATA!!! NOT THE + * TESTING DATA!!! + * @param preds The non-zeros of the TESTING data (not training). + */ + def predictor1(model0:Model, mat1:Mat, preds:Mat) = { + val model = model0.asInstanceOf[SMF] + val nopts = new PredOpts; + nopts.batchSize = math.min(10000, mat1.ncols/30 + 1) + nopts.putBack = -1 + nopts.initUval = 0f // Daniel: for consistency with training update. 
+    val newmod = new SMF(nopts);
+    newmod.refresh = false
+    newmod.copyFrom(model);
+    val mopts = model.opts.asInstanceOf[SMF.Opts];
+    nopts.dim = mopts.dim;
+    nopts.uconvg = mopts.uconvg;
+    nopts.lambdau = mopts.lambdau;
+    nopts.lambdam = mopts.lambdam;
+    nopts.regumean = mopts.regumean;
+    nopts.doUsers = mopts.doUsers;
+    nopts.weightByUser = mopts.weightByUser;
+    nopts.nmats = 2;
+    val nn = new Learner(
+        new MatSource(Array(mat1, zeros(mopts.dim, mat1.ncols), preds), nopts),
+        newmod,
+        null,
+        null,
+        new MatSink(nopts),
+        nopts)
+    (nn, nopts)
+  }
+}
diff --git a/src/main/scala/BIDMach/updaters/ADAGrad.scala b/src/main/scala/BIDMach/updaters/ADAGrad.scala
index bd27ea93..8658b1ee 100755
--- a/src/main/scala/BIDMach/updaters/ADAGrad.scala
+++ b/src/main/scala/BIDMach/updaters/ADAGrad.scala
@@ -12,6 +12,9 @@ import scala.concurrent.ExecutionContext.Implicits.global
 class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Updater {
+  // Three other sets of matrices (sumSq, momentum, and randmat) all have (if
+  // initialized) the same length and sizes as the modelmats and updatemats. Is the
+  // `mu` here a momentum term as well?
   var firstStep = 0f
   var modelmats:Array[Mat] = null
   var updatemats:Array[Mat] = null
@@ -19,15 +22,32 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda
   var stepn:Mat = null
   var mask:Mat = null
   var momentum:Array[Mat] = null;
-  var ve:Mat = null
-  var pe:Mat = null
-  var te:Mat = null
+  var ve:Mat = null      // opts.vexp, an ADAGrad parameter (see BIDMach wiki)
+  var pe:Mat = null      // Similar to opts.texp? Some exponent like the others?
+  var te:Mat = null      // opts.texp, an ADAGrad parameter (see BIDMach wiki)
   var lrate:Mat = null
+  var tscale:Mat = null  // Daniel: I had to add this to make it accessible to MHTest.
   var mu:Mat = null
   var one:Mat = null
   var randmat:Array[Mat] = null
+
+  /**
+   * Initialize the ADAGrad model. Note the conditions required to create momentum
+   * and randmats. We have opts.momentum but also momentum mats, similar thing
+   * with nesterov. I would *guess* that opts.momentum is that hyper-parameter,
+   * while the momentum mats give the update for each parameter (i.e. component
+   * wise). But then what's the hyperparameter for Nesterov? There isn't any?
+   * BTW for both opts.momentum and opts.nesterov, we can set them to be one
+   * value or a vector with one per modelmat. If just one value, then it's
+   * repeated across all modelmats. ALSO, what value to set for opts.langevin?
+   *
+   * EDIT: figured it out, opts.nesterov is the same as opts.momentum, assuming
+   * that we choose to run one of momentum or Nesterov (and not both, which
+   * wouldn't make sense).
+   */
   override def init(model0:Model) = {
+    tscale = 1  // Daniel: and had to do this here to make it non-null to start.
    model = model0
    modelmats = model.modelmats;
    updatemats = model.updatemats;
@@ -57,7 +77,13 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda
      ve <-- opts.vexp;
      te <-- opts.texp;
    }
+
+  /**
+   * Daniel: I don't know what this is supposed to do. However, there is *no*
+   * mention of any `update2` in the entire BIDMach repository --- I did a
+   * search. Hence, I think this method can be safely ignored.
+   */
  def update2(ipass:Int, step:Long):Unit = {
    modelmats = model.modelmats;
    updatemats = model.updatemats;
@@ -99,7 +125,26 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda
        um.clear;
      }
    }
+
+  /**
+   * Whew, the major heavy-hitting for ADAGrad. 
Let's try to digest this slowly ... + * + * First, it looks like John is defining nsteps and tscale. Not sure what + * these mean... + * + * Second is the heavy-duty part, looping over each model matrix and the other + * matrices involved. What is opts.policies? I'm confused. Anyway, look at the + * CPU version of the case methods. The other version is the GPU, which I'll + * ignore now. + * + * The CPU version is more readable. First, it deals with some logic regarding + * the sum of squares (?). Second, it applies the Langevin dynamics. Third, + * and the crucial part, it applies either momentum updates or nesterov + * updates. I don't *think* we should be using both momentum or nesterov, at + * least based on this code logic. OH, I see the learning rate applied, then + * momentum (or nesterov) and indeed it matches the algorithm formulation. + */ override def update(ipass:Int, step:Long, gprogress:Float):Unit = { val start = toc; modelmats = model.modelmats @@ -112,7 +157,8 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda step / firstStep; } } - val tscale = if (opts.texp.asInstanceOf[AnyRef] != 0) { + // Daniel: had to change this to make it accessible + var tscale = if (opts.texp.asInstanceOf[AnyRef] != 0) { stepn.set(1/(nsteps+1)); stepn ^ te; } else { @@ -121,7 +167,9 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda } val nw = stepn; val nmats = math.min(modelmats.length, updatemats.length); -// println("u sumsq %g" format mini(sumSq(0)).dv) + + // This is applied to each model matrix separately. + // NOTE!! I changed momentum to be momentum(i); see my GitHub issue. for (i <- 0 until nmats) { if (opts.policies.asInstanceOf[AnyRef] != null) { if (opts.policies.length > 1) { @@ -142,10 +190,10 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda case (gmm:GMat, gum:GMat, gss:GMat, gve:GMat, gts:GMat, glrate:GMat) => { if (opts.momentum.asInstanceOf[AnyRef] != null) { val mu = if (opts.momentum.length > 1) opts.momentum(i) else opts.momentum(0); - ADAGrad.ADAGradm(gmm, gum, gss, momentum.asInstanceOf[GMat], mu, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); + ADAGrad.ADAGradm(gmm, gum, gss, momentum(i).asInstanceOf[GMat], mu, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); } else if (opts.nesterov.asInstanceOf[AnyRef] != null) { val mu = if (opts.nesterov.length > 1) opts.nesterov(i) else opts.nesterov(0); - ADAGrad.ADAGradn(gmm, gum, gss, momentum.asInstanceOf[GMat], mu, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); + ADAGrad.ADAGradn(gmm, gum, gss, momentum(i).asInstanceOf[GMat], mu, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); } else { ADAGrad.ADAGradx(gmm, gum, gss, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); } diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 8bfc10d9..cefeda8a 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -4,7 +4,9 @@ import BIDMat.{Mat,SBMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat,TM import BIDMat.MatFunctions._ import BIDMat.SciFunctions._ import BIDMach.models._ -import 
edu.berkeley.bid.CUMACH +import BIDMach.models.Model._ +import edu.berkeley.bid.CUMACH +import BIDMach.Learner /** * Our fast MH test. See: @@ -61,13 +63,33 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater var b:Long = 0 // Current minibatch size (also `b` in the paper). var N:Long = 0 // Maximum minibatch size (i.e. all training data). var n:Float = 0f // The *number* of minibatches we are using. - var logu:Float = 0f // log u, since we assume a symmetric proposer. + var psi:Float = 0f // \psi = log (1 * prop_ratio * prior_ratio) var T:Int = 1 // The temperature of the distribution. - var t:Int = 0 // Current number of samples of theta. + var _t:Int = 0 // Current number of samples of theta. var sumOfValues:Float = 0f // \sum_{i=1}^b (N/T)*log(p(x_i|theta')/p(x_i|theta)). var sumOfSquares:Float = 0f // \sum_{i=1}^b ((N/T)*log(p(x_i|theta')/p(x_i|theta)))^2. var targetVariance:Float = 0f // The target variance (so we only need one X_corr). + // Daniel: experimental, for the SMF. + var currentSizeSMF:Int = -1; + var adagrad:ADAGrad = null; + var tmpMomentum:Array[Mat] = null + var acceptanceRate:Mat = null + var currentUpdatemats:Array[Mat] = null + var proposedUpdatemats:Array[Mat] = null + + var sdata0:GSMat = null + var user:Mat = null + var mm:Mat = null + var iavg:Mat = null + var avg:Mat = null + var uscale:Mat = null + var vexp:Mat = null + var lamu:Mat = null + var slm:Mat = null + var mlm:Mat = null + var firststep = -1f + /** * Standard initialization. We have: @@ -75,6 +97,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * - n2ld loads the pre-computed X_c variable distribution. * - {delta,proposed,tmp}Theta initialized to zeros with correct dimensions. * - If desired, initialize modelmats with small values to break symmetry. + * - If desired, initialize an internal ADAGrad updater. * * Note that the file for the norm2logdata should be in the correct directory. */ @@ -82,9 +105,21 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater model = model0; modelmats = model.modelmats updatemats = model.updatemats - scores0 = zeros(1,model.datasource.opts.batchSize) - scores1 = zeros(1,model.datasource.opts.batchSize) - diff = zeros(1,model.datasource.opts.batchSize) + acceptanceRate = zeros(1, opts.exitThetaAmount * 10) + if (opts.smf) { + val numnnz = model.asInstanceOf[SMF].getNonzeros() + if (numnnz < 0) { + println("Something wrong happened, numnnz="+numnnz) + sys.exit + } + scores0 = zeros(1, numnnz*10) + scores1 = zeros(1, numnnz*10) + diff = zeros(1, numnnz*10) + } else { + scores0 = zeros(1, model.datasource.opts.batchSize) + scores1 = zeros(1, model.datasource.opts.batchSize) + diff = zeros(1, model.datasource.opts.batchSize) + } T = opts.temp if (opts.Nknown) { @@ -115,14 +150,44 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater modelmats(i) <-- normrnd(0, 0.03f, modelmats(i).nrows, modelmats(i).ncols) } } + + if (opts.smf) { + // This should force adagrad.momentum(i) = momentum(i) in the rest of this code. 
+ adagrad = new ADAGrad(opts.asInstanceOf[ADAGrad.Opts]) + adagrad.init(model) + currentUpdatemats = new Array[Mat](nmats) + proposedUpdatemats = new Array[Mat](nmats) + for (i <- 0 until nmats) { + currentUpdatemats(i) = modelmats(i).zeros(modelmats(i).nrows, modelmats(i).ncols) + proposedUpdatemats(i) = modelmats(i).zeros(modelmats(i).nrows, modelmats(i).ncols) + } + + sdata0 = GSMat(model.datasource.omats(0).asInstanceOf[SMat]) + user = FactorModel.reuseuser(sdata0, model.opts.dim, 0f) + mm = proposedTheta(0) + iavg = proposedTheta(1) + avg = proposedTheta(2) + uscale = mm.zeros(1,1) + vexp = GMat(row(0.5f)) + lamu = mm.ones(opts.asInstanceOf[SMF.Opts].dim, 1) ∘ opts.asInstanceOf[SMF.Opts].lambdau + slm = mm.ones(1,1) ∘ (opts.asInstanceOf[SMF.Opts].lambdam * sdata0.ncols); + mlm = mm.ones(1,1) ∘ (opts.asInstanceOf[SMF.Opts].regmmean * sdata0.ncols); + firststep = -1f + } } - + /** * This performs the update and the MH test based on a minibatch of data. The * original data is split up into equal-sized minibatches in the Learner code. * (The last minibatch is ignored since it generally has a different size.) * + * SMF.scala necessitates extra cases to handle the varying batch sizes. They + * differ across "minibatches" so the scores at "the end" have to be cleared + * (but since scores are only for current MB, just call "clear"), the number + * of nonzeros have to be computed, and then the scores are copied to the + * appropriate interval. EDIT: ugh, never mind, doesn't work ... + * * @param ipass The current pass over the full (training) data. * @param step Progress within the current minibatch, indicated as a numerical * index representing the starting column of this minibatch. @@ -130,29 +195,49 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * datasource size and the number of (training) passes. */ override def update(ipass:Int, step:Long, gprogress:Float):Unit = { - if (newMinibatch) beforeEachMinibatch() - b += model.datasource.opts.batchSize + if (newMinibatch) beforeEachMinibatch(ipass, step, gprogress) n += 1.0f // (Part 1) Compute scores for theta and theta', scaled by N/T. - scores0 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + if (opts.smf) { + currentSizeSMF = model.datasource.omats(0).nnz + b += currentSizeSMF + scores0.clear + scores0(0 -> currentSizeSMF) = (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)).t + } else { + scores0 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + b += scores0.length + } if (scores0.length == 1) { throw new RuntimeException("Need individual scores, but getting a scalar.") } + for (i <- 0 until modelmats.length) { modelmats(i) <-- proposedTheta(i) } - scores1 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) - diff ~ scores1 - scores0 + if (opts.smf) { + scores1.clear + scores1(0 -> currentSizeSMF) = (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)).t + diff.clear + diff(0 -> currentSizeSMF) = scores1(0 -> currentSizeSMF) - scores0(0 -> currentSizeSMF) + } else { + scores1 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + diff ~ scores1 - scores0 + } // (Part 2) Update our \Delta* and sample variance of \Delta*. 
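      // Concretely, with l_i = (N/T)*log(p(x_i|theta')/p(x_i|theta)) for the i-th
      // point in the current minibatch of size b, the lines below form
      //   deltaStar      = (1/b) * sum_i l_i  -  psi
      //   sampleVariance = ( (1/b) * sum_i l_i^2 - ((1/b) * sum_i l_i)^2 ) / b
      // i.e. the estimated mean of the l_i minus psi, and the variance of that mean.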
- sumOfSquares += sum((diff)*@(diff)).v - sumOfValues += sum(diff).v - val deltaStar = sumOfValues/b.v - logu - val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v + if (opts.smf) { + sumOfSquares += sum((diff(0 -> currentSizeSMF)) *@ (diff(0 -> currentSizeSMF))).v + sumOfValues += sum(diff(0 -> currentSizeSMF)).v + } else { + sumOfSquares += sum((diff)*@(diff)).v + sumOfValues += sum(diff).v + } + val deltaStar = sumOfValues/b.v - psi + val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v val numStd = deltaStar / math.sqrt(sampleVariance) var accept = false - if (opts.verboseMH) debugPrints(sampleVariance, deltaStar) + if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd, sumOfValues/b.v, psi) // (Part 3) Run our test! // (Part 3.1) Take care of the full data case; this usually indicates a problem. @@ -170,7 +255,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater } } // (Part 3.2) Abnormally good or bad minibatches. - else if (math.abs(numStd) > 5.0) { + else if (math.abs(numStd) > 10.0) { if (opts.verboseMH) { println("\tCASE 1: math.abs(numStd) = " +math.abs(numStd)) } @@ -192,7 +277,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater val Xc = normlogrnd(1,1).dv val testStat = deltaStar + Xn + Xc if (opts.verboseMH) { - println("\tCASE 3; with testStat = "+testStat) + println("\tCASE 3; with testStat = %1.4f (Xn = %1.4f, Xc = %1.4f)" format (testStat, Xn, Xc)) } if (testStat > 0) { accept = true @@ -201,35 +286,128 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater // (Part 4) Reset parameters and use <-- to avoid alias problems. if (accept) { + if (opts.verboseMH) println("\tACCEPT") for (i <- 0 until modelmats.length) { tmpTheta(i) <-- modelmats(i) // Now tmpTheta has proposed theta. - } + } + acceptanceRate(_t) = 1 } else { + if (opts.verboseMH) println("\treject") for (i <- 0 until modelmats.length) { modelmats(i) <-- tmpTheta(i) // Now modelmats back to old theta. } + acceptanceRate(_t) = 0 } - if (newMinibatch && accept) afterEachMinibatch() + + if (newMinibatch) afterEachMinibatch(ipass, gprogress) } - + /** * Stuff we should do before each minibatch. This involves calling the * proposer, resetting some values, and saving the current model matrix into - * `tmpTheta` so we can restore it later when needed. + * `tmpTheta` so we can restore it later when needed. Here, we want to set the + * proposer matrices, so that when we continue in uupdate, we have the current + * and proposed model matrices stored in modelmats and proposedTheta, + * respectively. + * + * Also, we have a different (i.e. better!) proposer with ADAGrad. The update + * *should* affect all of the modelmats(i) due to aliasing (since it changes + * adagrad.modelmats(i)). However, this doesn't put it in proposedTheta, so + * here's a workaround: get the modelmats stored into tmpTheta. Then do the + * update, which will update modelmats to the proposed matrices. Then copy + * those into propsoedTheta, and then get current modelmats back to tmpTheta + * (i.e. so modelmats remains the same before and after, and it's just the + * proposedTheta which changes). With momentum, fortunately it's simpler, we + * have that in adagrad.momentum (that's the "proposed" momentum) and simply + * keep the "current" one in tmpMomentum. 
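+   *
+   * For reference, the loop at the end of this method evaluates
+   *
+   *   psi = sum_i (1/(2*sigma^2)) * ( ||theta_i  - theta'_i - tau*grad_i(theta')||^2
+   *                                 - ||theta'_i - theta_i  - tau*grad_i(theta) ||^2 )
+   *
+   * over the model matrices, with tau = lrate, sigma = langevin, and grad_i the
+   * corresponding updatemats. Up to constants that cancel, this equals
+   * log N(theta'; theta + tau*grad(theta), sigma^2 I) - log N(theta; theta' + tau*grad(theta'), sigma^2 I),
+   * the forward-minus-reverse log density of a Langevin (MALA-style) Gaussian proposal.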
*/ - def beforeEachMinibatch() { + def beforeEachMinibatch(ipass:Int, pos:Long, gprogress:Float) { if (opts.verboseMH) println("\n\tNew minibatch!") - randomWalkProposer() - logu = ln(rand(1,1)).v + for (i <- 0 until modelmats.length) { + tmpTheta(i) <-- modelmats(i) + currentUpdatemats(i) <-- updatemats(i) + } + + // Important, update the model matrices. Also, compute \psi(1,theta,theta'). + // Did computation by hand. A lot of this depends on whether it's SMF or not. + if (opts.smf) { + + // Now modelmats is DIFFERENT! modelmats is now the proposed stuff. Note + // that this does not change the updatemats; that came from SMF.scala. We + // then make proposedTheta contain the proposed stuff! Think of this as + // only providing us with proposedTheta (and keeping modelmats unchanged). + adagrad.update(ipass, pos, gprogress) + for (i <- 0 until modelmats.length) { + proposedTheta(i) <-- adagrad.modelmats(i) // adagrad.modelmats(i) = modelmats(i) + modelmats(i) <-- tmpTheta(i) // Should make adagrad.modelmats(i) back to what it was before. + } + + // Next, with proposedTheta, we now compute the proposed log gradient. I + // think it's easier to copy the relevant mupdate code here. This is very + // specific to SMF.scala, sorry. I actually have to REDO the calls here + // ... oh boy. But fortunately, the `user` matrix gets reset to 0 each + // time for the real SMF, so I can do it here. ALSO, make sure I don't + // update the biases as I would with a call to mupdate in ipass=0. + + sdata0 = GSMat(model.datasource.omats(0).asInstanceOf[SMat]) + user <-- FactorModel.reuseuser(sdata0, model.opts.dim, 0f) + mm <-- proposedTheta(0) + iavg <-- proposedTheta(1) + avg <-- proposedTheta(2) + + // UUPDATE, which means (for instance) predictions are based on *proposed* theta. + if (firststep <= 0) firststep = pos.toFloat; + val step = (pos + firststep)/firststep; + val texp = if (opts.asInstanceOf[Grad.Opts].texp.asInstanceOf[AnyRef] != null) { + opts.asInstanceOf[Grad.Opts].texp.dv + } else { + opts.asInstanceOf[Grad.Opts].pexp.dv + } + uscale.set(opts.asInstanceOf[SMF.Opts].urate * math.pow(ipass+1, - texp).toFloat) + val sdata = sdata0 - GMat(iavg+avg); + val bb = mm * sdata; + val ucounts = sum(sdata0 != 0f); + val uci = (ucounts + 1f) ^ (- vexp); + for (i <- 0 until opts.asInstanceOf[SMF.Opts].uiter) { + val preds = DDS(mm, user, sdata); + val deriv = bb - mm * preds - (user ∘ lamu); + val du = (deriv ∘ uscale ∘ uci); + user ~ user + du; + } + + // MUPDATE + val preds = DDS(proposedTheta(0), user, sdata); + val diffs = sdata + 2f; + diffs.contents ~ sdata.contents - preds.contents; + val icomp = sdata0 != 0f + val icount = sum(sdata0 != 0f, 2); + proposedUpdatemats(1) = (sum(diffs,2) - iavg*mlm) / (icount + 1f); + proposedUpdatemats(2) ~ sum(diffs.contents) / (diffs.contents.length + 1f); + if (firststep <= 0) firststep = pos.toFloat; + proposedUpdatemats(0) = (user *^ diffs - (mm ∘ slm)) / ((icount + 1).t ^ vexp); + + // Finally, we can get the psi terms. + psi = 0f + val tau = FMat(adagrad.lrate).v + val sigma = FMat(adagrad.opts.langevin).v + for (i <- 0 until modelmats.length) { + val term1 = modelmats(i) - proposedTheta(i) - tau*proposedUpdatemats(i) + val term2 = proposedTheta(i) - modelmats(i) - tau*currentUpdatemats(i) + psi += (1f/(2*sigma*sigma)) * ((term1 ddot term1) - (term2 ddot term2)).v + } + } + else { + // Otherwise, it's easy, just do a random walk and set psi = ln(1) = 0. 
+ randomWalkProposer() + psi = 0f + } + newMinibatch = false b = 0 n = 0 sumOfValues = 0f sumOfSquares = 0f - for (i <- 0 until modelmats.length) { - tmpTheta(i) <-- modelmats(i) - } } @@ -239,23 +417,32 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * logic about the burn-in period, and also exit the program if we reach the * desired number of samples. */ - def afterEachMinibatch() { - t += 1 + def afterEachMinibatch(ipass:Int, gprogress:Float) { + _t += 1 if (opts.collectData) { for (i <- 0 until modelmats.length) { - saveFMat(opts.collectDataDir+ "theta_%d_%04d.fmat.lz4" format (i,t), FMat(modelmats(i))) + saveFMat(opts.collectDataDir+ "theta_%d_%04d.fmat.lz4" format (i,_t), FMat(modelmats(i))) } - saveFMat(opts.collectDataDir+ "data_%04d.fmat.lz4" format (t), FMat(b)) + saveFMat(opts.collectDataDir+ "data_%04d.fmat.lz4" format (_t), FMat(b)) } - if (t == opts.exitThetaAmount && opts.exitTheta) { - println("Exiting code now since t=" +t) + if (_t == opts.exitThetaAmount && opts.exitTheta) { + println("Exiting code now since t=" +_t) sys.exit } - if (t == opts.burnIn) { + if (_t == opts.burnIn) { println("ALERT: Past burn-in period. Now change temperature, proposer, etc.") T = opts.tempAfterBurnin opts.sigmaProposer = opts.sigmaProposerAfterBurnin } + if (opts.smf) { + if (opts.saveAcceptRate && (ipass+1) == opts.asInstanceOf[Learner.Options].npasses + && gprogress > 0.99) { + val lr = adagrad.opts.lrate.v + val lang = adagrad.opts.langevin(0).v + saveMat(opts.acceptRateDir+"arate_%1.4f_%1.3f.mat.lz4" format (lr,lang), + acceptanceRate(0 -> _t)) + } + } } @@ -312,11 +499,19 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater /** This is for debugging. */ - def debugPrints(sampleVariance:Float, deltaStar:Float) { - println("b="+b+", n="+n+", logu="+logu+ ", b-mbSize="+(b - model.datasource.opts.batchSize).toInt) - println("mean(scores0) = "+mean(scores0,2).dv+", mean(scores1) = "+mean(scores1,2).dv) - println("sampleVar = " +sampleVariance) - println("delta* = " + deltaStar) + def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double, sumDivB:Float, psi:Float) { + val s1 = mean(scores1(0 -> b.toInt)).dv + val s0 = mean(scores0(0 -> b.toInt)).dv + println("b = %d, n = %d" format (b, n.toInt)) + println("mean(scores1) (%1.4f) - mean(scores0) (%1.4f) = %1.4f" format (s1, s0, s1-s0)) + println("maxi(scores1) = "+maxi(scores1(0 -> b.toInt))+", maxi(scores0) = "+maxi(scores0(0 -> b.toInt))) + println("mini(scores1) = "+mini(scores1(0 -> b.toInt))+", mini(scores0) = "+mini(scores0(0 -> b.toInt))) + if (opts.smf) { + println("delta^* (%1.4f) = sumDivB (%1.4f) - psi (%1.4f)" format (deltaStar, sumDivB, psi)) + println("sampleVar = %1.4f, numStd = %1.4f" format (sampleVariance, numStd)) + } else { + println("delta^* = %1.4f, sampleVar = %1.4f, numStd = %1.4f" format (deltaStar, sampleVariance, numStd)) + } } } @@ -341,6 +536,9 @@ object MHTest { var exitThetaAmount = 3000 var initThetaHere = false var burnIn = -1 + var smf = false + var saveAcceptRate = false + var acceptRateDir = "tmp/" } class Options extends Opts {} diff --git a/src/main/scala/BIDMach/updaters/Updater.scala b/src/main/scala/BIDMach/updaters/Updater.scala index 614f67e6..fe34b9d6 100755 --- a/src/main/scala/BIDMach/updaters/Updater.scala +++ b/src/main/scala/BIDMach/updaters/Updater.scala @@ -19,7 +19,7 @@ abstract class Updater(val opts:Updater.Opts = new Updater.Options) extends Seri def update(ipass:Int, step:Long):Unit = {} def update(ipass:Int, 
step:Long, gprogress:Float):Unit = update(ipass, step) - + def updateM(ipass:Int):Unit = { model.updatePass(ipass) }
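
For readers trying to follow MHTest.update above, here is a minimal, self-contained sketch (plain Scala, independent of BIDMach) of the per-minibatch statistic and the normal-case accept decision. It assumes the normal correction X_n is drawn with variance targetVariance - sampleVariance, stubs out the X_corr draw (the real updater samples it from the precomputed norm2logdata table), and omits the guard cases (full-data minibatches, abnormal minibatches, sample variance above the target); every name below is illustrative rather than taken from the repository.

import scala.util.Random

object MHTestSketch {
  val rng = new Random(0)

  // Placeholder for the X_corr draw (normlogrnd in the real updater).
  def xCorrSample(): Double = 0.0

  /** scores0/scores1: per-example (N/T)-scaled log-likelihoods under the current
    * and proposed parameters; psi: log proposal correction, 0 for a symmetric
    * random-walk proposer. Returns true if the proposal should be accepted. */
  def accept(scores0: Array[Double], scores1: Array[Double],
             psi: Double, targetVariance: Double): Boolean = {
    val b = scores0.length.toDouble
    val diff = scores1.zip(scores0).map { case (s1, s0) => s1 - s0 }
    val mean = diff.sum / b
    val deltaStar = mean - psi
    val sampleVariance = (diff.map(d => d * d).sum / b - mean * mean) / b
    // Normal case: add N(0, targetVariance - sampleVariance) noise plus X_corr,
    // and accept when the corrected statistic is positive.
    val xn = math.sqrt(math.max(0.0, targetVariance - sampleVariance)) * rng.nextGaussian()
    deltaStar + xn + xCorrSample() > 0
  }
}

Calling accept with psi = 0 corresponds to the symmetric random-walk branch of beforeEachMinibatch; the SMF path instead passes the Langevin log proposal ratio described there.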