ActivitySim · bstabler · Jun 19, 2020 · May 14, 2020 · Jun 1, 2020 · Jun 11, 2020
diff --git a/populationsim/balancer.py b/populationsim/balancer.py
@@ -208,7 +208,7 @@ def np_balancer(
                     yy + relaxed_constraint / float(importance))
 
             # update HH weights
-            weights_final[incidence[c] > 0] *= gamma[c]
+            weights_final *= pow(gamma[c], incidence[c])
 
             # clip weights to upper and lower bounds
             weights_final = np.clip(weights_final, weights_lower_bound, weights_upper_bound)

diff --git a/populationsim/simul_balancer.py b/populationsim/simul_balancer.py
@@ -238,7 +238,7 @@ def np_simul_balancer(
                         yy + (relaxed_constraint / float(importance)))
 
                 # update HH weights
-                sub_weights[z][incidence[c] > 0] *= gamma[z, c]
+                sub_weights[z] *= pow(gamma[z, c], incidence[c])
 
                 # clip weights to upper and lower bounds
                 sub_weights[z] = np.clip(sub_weights[z], weights_lower_bound, weights_upper_bound)

diff --git a/populationsim/steps/sub_balancing.py b/populationsim/steps/sub_balancing.py
@@ -252,9 +252,12 @@ def sub_balancing(settings, crosswalk, control_spec, incidence_table):
         # only want ones for which there are (non-zero) controls
         parent_ids = parent_controls_df.index.intersection(parent_ids)
 
-        for parent_id in parent_ids:
+        num_parent_ids = len(parent_ids)
+        for idx, parent_id in enumerate(parent_ids, start=1):
 
-            logger.info("balancing seed %s, %s %s" % (seed_id, parent_geography, parent_id))
+            log_msg = "balancing {}/{} seed {}, {} {}"
+            log_msg = log_msg.format(idx, num_parent_ids, seed_id, parent_geography, parent_id)
+            logger.info(log_msg)
 
             initial_weights = weights_df[weights_df[parent_geography] == parent_id]
             initial_weights = initial_weights.set_index(settings.get('household_id_col'))

diff --git a/populationsim/tests/test_steps.py b/populationsim/tests/test_steps.py
@@ -38,7 +38,7 @@ def teardown_function(func):
 
 
 TAZ_COUNT = 36
-TAZ_100_HH_COUNT = 25
+TAZ_100_HH_COUNT = 33
 TAZ_100_HH_REPOP_COUNT = 26
 
 
@@ -51,7 +51,7 @@ def test_full_run1():
         'meta_control_factoring',
         'final_seed_balancing',
         'integerize_final_seed_weights',
-        'sub_balancing.geography = TRACT',
+        'sub_balancing.geography=TRACT',
         'sub_balancing.geography=TAZ',
         'expand_households',
         'summarize',

diff --git a/scripts/validation.ipynb b/scripts/validation.ipynb
@@ -93,7 +93,7 @@
     "    packages = 'pyyaml pandas numpy matplotlib'\n",
     "    ret = os.system(f'conda install {packages}')\n",
     "    if ret != 0:\n",
-    "        os.system(f'pip install {packages}')"
+    "        os.system(f'{sys.executable} -m pip install {packages}')"
    ]
   },
   {
@@ -840,7 +840,7 @@
     }
    ],
    "source": [
-    "def meta_geog_df(meta_geog):\n",
+    "def meta_geog_df(summary_df, meta_geog):\n",
     "    geography_df = pd.read_csv(os.path.join(popsim_dir, geography_file))\n",
     "    geog = use_geographies[use_geographies.index(meta_geog) + 1]  # next geography in list\n",
     "    meta_df = geography_df[[meta_geog, geog]].drop_duplicates(ignore_index=True)\n",
@@ -852,7 +852,7 @@
     "    \n",
     "for geog in use_geographies:\n",
     "    if not geog in summary_df.geography.unique():\n",
-    "        summary_df = summary_df.append(meta_geog_df(geog))\n",
+    "        summary_df = summary_df.append(meta_geog_df(summary_df, geog))\n",
     "\n",
     "summary_df.tail()"
    ]
@@ -870,7 +870,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def process_control(name, geography, control, result):\n",
+    "def process_control(summary_df, name, geography, control, result):\n",
     "    \"\"\"\n",
     "    Global\n",
     "    ------\n",
@@ -948,12 +948,12 @@
     "\n",
     "stats = []\n",
     "for params, ax in zip(aggregate_list, axes.ravel()):\n",
-    "    s, f = process_control(**params)\n",
+    "    s, f = process_control(summary_df, **params)\n",
     "    stats.append(s)\n",
     "    \n",
     "    ax.set_title(f\"{params['geography']} - {params['name']}\")\n",
     "    ax.set_ylabel('Frequency'); ax.set_xlabel('Difference: control vs. result')\n",
-    "    ax.scatter(f.index, f)\n",
+    "    ax.hist(diff, bins=10, range=(-5,5), alpha=0.5)\n",
     "\n",
     "summary_fig.savefig(os.path.join(validation_dir, 'frequencies.pdf'))\n",
     "stats_df = pd.DataFrame(stats)\n",