schorrm · schorrm · Jun 2, 2020 · May 24, 2020 · May 24, 2020 · Jun 2, 2020
diff --git a/docs/chadwick_register.md b/docs/chadwick_register.md
@@ -0,0 +1,23 @@
+# Chadwick Register
+
+`chadwick_register(save: bool = False)`
+
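+Get the Chadwick Bureau register of people as a pandas DataFrame. Only the name and ID columns used for player lookup are kept, and rows with no MLB-related data are dropped.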
+
+## Arguments
+
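+`save:` bool, default `False`. If `True`, save the downloaded register to disk as `chadwick-register.csv` in the current working directory. If that file already exists, it is read instead of downloading the register again.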
+
+<!-- This data comes from Chadwick Bureau, meaning that there are several people in this data who are not MLB players. For this reason, supplying both last and first name is recommended to narrow your search.  -->
+
+## Example
+
+```python
+from pybaseball import chadwick_register
+
+# get the register data
+data = chadwick_register()
+
+# get the register data and save to disk
+data = chadwick_register(save=True)
+```
diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py
@@ -1,6 +1,7 @@
 import pybaseball.utils
 from .playerid_lookup import playerid_reverse_lookup
 from .playerid_lookup import playerid_lookup
+from .playerid_lookup import chadwick_register
 from .statcast import statcast, statcast_single_game
 from .statcast_pitcher import statcast_pitcher
 from .statcast_batter import statcast_batter, statcast_batter_exitvelo_barrels

diff --git a/pybaseball/playerid_lookup.py b/pybaseball/playerid_lookup.py
@@ -1,26 +1,48 @@
 import pandas as pd
 import requests
 import io
+import os
 
-# dropped key_uuid. looks like a has we wouldn't need for anything. 
-# TODO: allow for typos. String similarity? 
+# Dropped key_uuid. It looks like a hash we wouldn't need for anything.
+# TODO: allow for typos. String similarity?
 
+url = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv"
+register_file = 'chadwick-register.csv'
+
+def chadwick_register(save: bool = False) -> pd.DataFrame:
+    ''' Get the Chadwick register Database '''
+
+    if os.path.exists(register_file):
+        table = pd.read_csv(register_file)
+        return table
 
-def get_lookup_table():
     print('Gathering player lookup table. This may take a moment.')
-    url = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv"
-    s=requests.get(url).content
-    table = pd.read_csv(io.StringIO(s.decode('utf-8')), dtype={'key_sr_nfl': object, 'key_sr_nba': object, 'key_sr_nhl': object})
-    #subset columns
-    cols_to_keep = ['name_last','name_first','key_mlbam', 'key_retro', 'key_bbref', 'key_fangraphs', 'mlb_played_first','mlb_played_last']
+    s = requests.get(url).content
+    mlb_only_cols = ['key_retro', 'key_bbref', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']
+    cols_to_keep = ['name_last', 'name_first', 'key_mlbam'] + mlb_only_cols
+    table = pd.read_csv(io.StringIO(s.decode('utf-8')), usecols=cols_to_keep)
+
+    table.dropna(how='all', subset=mlb_only_cols, inplace=True) # Keep only the major league rows
+    table.reset_index(inplace=True, drop=True)
+
+    table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].fillna(-1)
+    # originally returned as floats which is wrong
+    table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].astype(int)
+
+    # Reorder the columns to the right order
     table = table[cols_to_keep]
+
+    if save:
+        table.to_csv(register_file, index=False)
+
+    return table
+
+
+def get_lookup_table(save=False):
+    table = chadwick_register(save)
     #make these lowercase to avoid capitalization mistakes when searching
     table['name_last'] = table['name_last'].str.lower()
     table['name_first'] = table['name_first'].str.lower()
-    # Pandas cannot handle NaNs in integer columns. We need IDs to be ints for successful queries in statcast, etc. 
-    # Workaround: replace ID NaNs with -1, then convert columns to integers. User will have to understand that -1 is not a valid ID. 
-    table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].fillna(-1)
-    table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].astype(int) # originally returned as floats which is wrong
     return table
 
 
@@ -31,32 +53,32 @@ def playerid_lookup(last=None, first=None, player_list=None):
     if first:
         first = first.lower()
     table = get_lookup_table()
-    
+
     # if player_list has a value, then the user is passing in a list of players
     # the list of players may be comma delimited for last, first, or just last
     if player_list:
         player_counter = 1
         for player in player_list:
             last = player.split(",")[0].strip()
             first = None
-            if(len(player.split(",")) > 1):
+            if (len(player.split(",")) > 1):
                 first = player.split(",")[1].strip()
-            if(player_counter == 1):
+            if (player_counter == 1):
                 results = playerid_lookup(last, first)
             else:
-                results = results.append(playerid_lookup(last, first), ignore_index = True)
+                results = results.append(playerid_lookup(last, first), ignore_index=True)
             player_counter += 1
         return results
-
-
+
     if first is None:
-        results = table.loc[table['name_last']==last]
+        results = table.loc[table['name_last'] == last]
     else:
-        results = table.loc[(table['name_last']==last) & (table['name_first']==first)]
+        results = table.loc[(table['name_last'] == last) & (table['name_first'] == first)]
     #results[['key_mlbam', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']] = results[['key_mlbam', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']].astype(int) # originally returned as floats which is wrong
     results = results.reset_index().drop('index', 1)
     return results
 
+
 # data = playerid_lookup('bonilla')
 # data = playerid_lookup('bonilla', 'bobby')
 
@@ -71,14 +93,18 @@ def playerid_reverse_lookup(player_ids, key_type=None):
 
     :rtype: :class:`pandas.core.frame.DataFrame`
     """
-    key_types = ('mlbam', 'retro', 'bbref', 'fangraphs', )
+    key_types = (
+        'mlbam',
+        'retro',
+        'bbref',
+        'fangraphs',
+    )
 
     if not key_type:
-        key_type = key_types[0]     # default is "mlbam" if key_type not provided
+        key_type = key_types[0]  # default is "mlbam" if key_type not provided
     elif key_type not in key_types:
-        raise ValueError(
-            '[Key Type: {}] Invalid; Key Type must be one of "{}"'.format(key_type, '", "'.join(key_types))
-        )
+        raise ValueError('[Key Type: {}] Invalid; Key Type must be one of "{}"'.format(
+            key_type, '", "'.join(key_types)))
 
     table = get_lookup_table()
     key = 'key_{}'.format(key_type)