diff --git a/docs/chadwick_register.md b/docs/chadwick_register.md new file mode 100644 index 0000000..a8d5a4d --- /dev/null +++ b/docs/chadwick_register.md @@ -0,0 +1,23 @@ +# Player ID Lookup + +`chadwick_register(save: bool = False)` + +Get the Chadwick register people info. + +## Arguments + +`save:` bool. Save the file to disk. + + + +## Example + +```python +from pybaseball import chadwick_register + +# get the register data +data = chadwick_register() + +# get the register data and save to disk +data = chadwick_register(save=True) +``` diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py index 0217574..c965b31 100644 --- a/pybaseball/__init__.py +++ b/pybaseball/__init__.py @@ -1,6 +1,7 @@ import pybaseball.utils from .playerid_lookup import playerid_reverse_lookup from .playerid_lookup import playerid_lookup +from .playerid_lookup import chadwick_register from .statcast import statcast, statcast_single_game from .statcast_pitcher import statcast_pitcher from .statcast_batter import statcast_batter, statcast_batter_exitvelo_barrels diff --git a/pybaseball/playerid_lookup.py b/pybaseball/playerid_lookup.py index 0eb9a0e..780b8d2 100644 --- a/pybaseball/playerid_lookup.py +++ b/pybaseball/playerid_lookup.py @@ -1,26 +1,48 @@ import pandas as pd import requests import io +import os -# dropped key_uuid. looks like a has we wouldn't need for anything. -# TODO: allow for typos. String similarity? +# dropped key_uuid. looks like a hash we wouldn't need for anything. +# TODO: allow for typos. String similarity? +url = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv" +register_file = 'chadwick-register.csv' + +def chadwick_register(save: bool = False) -> pd.DataFrame: + ''' Get the Chadwick register Database ''' + + if os.path.exists(register_file): + table = pd.read_csv(register_file) + return table -def get_lookup_table(): print('Gathering player lookup table. 
This may take a moment.') - url = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv" - s=requests.get(url).content - table = pd.read_csv(io.StringIO(s.decode('utf-8')), dtype={'key_sr_nfl': object, 'key_sr_nba': object, 'key_sr_nhl': object}) - #subset columns - cols_to_keep = ['name_last','name_first','key_mlbam', 'key_retro', 'key_bbref', 'key_fangraphs', 'mlb_played_first','mlb_played_last'] + s = requests.get(url).content + mlb_only_cols = ['key_retro', 'key_bbref', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last'] + cols_to_keep = ['name_last', 'name_first', 'key_mlbam'] + mlb_only_cols + table = pd.read_csv(io.StringIO(s.decode('utf-8')), usecols=cols_to_keep) + + table.dropna(how='all', subset=mlb_only_cols, inplace=True) # Keep only the major league rows + table.reset_index(inplace=True, drop=True) + + table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].fillna(-1) + # originally returned as floats which is wrong + table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].astype(int) + + # Reorder the columns to the right order table = table[cols_to_keep] + + if save: + table.to_csv(register_file, index=False) + + return table + + +def get_lookup_table(save=False): + table = chadwick_register(save) #make these lowercase to avoid capitalization mistakes when searching table['name_last'] = table['name_last'].str.lower() table['name_first'] = table['name_first'].str.lower() - # Pandas cannot handle NaNs in integer columns. We need IDs to be ints for successful queries in statcast, etc. - # Workaround: replace ID NaNs with -1, then convert columns to integers. User will have to understand that -1 is not a valid ID. 
- table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].fillna(-1) - table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].astype(int) # originally returned as floats which is wrong return table @@ -31,7 +53,7 @@ def playerid_lookup(last=None, first=None, player_list=None): if first: first = first.lower() table = get_lookup_table() - + # if player_list has a value, then the user is passing in a list of players # the list of players may be comma delimited for last, first, or just last if player_list: @@ -39,24 +61,24 @@ def playerid_lookup(last=None, first=None, player_list=None): for player in player_list: last = player.split(",")[0].strip() first = None - if(len(player.split(",")) > 1): + if (len(player.split(",")) > 1): first = player.split(",")[1].strip() - if(player_counter == 1): + if (player_counter == 1): results = playerid_lookup(last, first) else: - results = results.append(playerid_lookup(last, first), ignore_index = True) + results = results.append(playerid_lookup(last, first), ignore_index=True) player_counter += 1 return results - - + if first is None: - results = table.loc[table['name_last']==last] + results = table.loc[table['name_last'] == last] else: - results = table.loc[(table['name_last']==last) & (table['name_first']==first)] + results = table.loc[(table['name_last'] == last) & (table['name_first'] == first)] #results[['key_mlbam', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']] = results[['key_mlbam', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']].astype(int) # originally returned as floats which is wrong results = results.reset_index().drop('index', 1) return results + # data = playerid_lookup('bonilla') # data = playerid_lookup('bonilla', 'bobby') @@ -71,14 +93,18 @@ def playerid_reverse_lookup(player_ids, key_type=None): :rtype: :class:`pandas.core.frame.DataFrame` """ - key_types = ('mlbam', 'retro', 'bbref', 'fangraphs', ) + key_types = ( + 'mlbam', + 'retro', + 'bbref', + 
'fangraphs', + ) if not key_type: - key_type = key_types[0] # default is "mlbam" if key_type not provided + key_type = key_types[0] # default is "mlbam" if key_type not provided elif key_type not in key_types: - raise ValueError( - '[Key Type: {}] Invalid; Key Type must be one of "{}"'.format(key_type, '", "'.join(key_types)) - ) + raise ValueError('[Key Type: {}] Invalid; Key Type must be one of "{}"'.format( + key_type, '", "'.join(key_types))) table = get_lookup_table() key = 'key_{}'.format(key_type)