Skip to content

Commit

Permalink
Chadwick Lookup Table (#7)
Browse files Browse the repository at this point in the history
* Chadwick Register functionality
* Lookup Table
  • Loading branch information
schorrm authored Jun 2, 2020
1 parent 9a80484 commit 4679d82
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 25 deletions.
23 changes: 23 additions & 0 deletions docs/chadwick_register.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Player ID Lookup

`chadwick_register(save: bool = False)`

Get the Chadwick Bureau register of people (names and player IDs) as a pandas DataFrame.

## Arguments

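`save:` bool, default `False`. If `True`, the downloaded register is saved to `chadwick-register.csv` in the current working directory. Whenever that file exists, it is read instead of downloading the register again.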

<!-- This data comes from Chadwick Bureau, meaning that there are several people in this data who are not MLB players. For this reason, supplying both last and first name is recommended to narrow your search. -->

## Example

```python
from pybaseball import chadwick_register

# get the register data
data = chadwick_register()

# get the register data and save to disk
data = chadwick_register(save=True)
```
1 change: 1 addition & 0 deletions pybaseball/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pybaseball.utils
from .playerid_lookup import playerid_reverse_lookup
from .playerid_lookup import playerid_lookup
from .playerid_lookup import chadwick_register
from .statcast import statcast, statcast_single_game
from .statcast_pitcher import statcast_pitcher
from .statcast_batter import statcast_batter, statcast_batter_exitvelo_barrels
Expand Down
76 changes: 51 additions & 25 deletions pybaseball/playerid_lookup.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,48 @@
import pandas as pd
import requests
import io
import os

# dropped key_uuid. looks like a hash we wouldn't need for anything.
# TODO: allow for typos. String similarity?
# dropped key_uuid. looks like a hash we wouldn't need for anything.
# TODO: allow for typos. String similarity?

url = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv"
register_file = 'chadwick-register.csv'


def chadwick_register(save: bool = False) -> pd.DataFrame:
    """Return the Chadwick register as a DataFrame of names and player IDs.

    If ``register_file`` exists (path resolved against the current working
    directory), it is read and returned unchanged and no download happens.
    Otherwise the register is downloaded from ``url``, trimmed to the name,
    ID and MLB-career columns, and optionally written to ``register_file``.

    Args:
        save: When True, write the freshly downloaded table to
            ``register_file``. Has no effect when the cached file already
            exists, because that file is returned before any download.

    Returns:
        A DataFrame. On the download path its columns are ``name_last``,
        ``name_first``, ``key_mlbam``, ``key_retro``, ``key_bbref``,
        ``key_fangraphs``, ``mlb_played_first`` and ``mlb_played_last``, with
        missing ``key_mlbam`` / ``key_fangraphs`` values stored as ``-1``.

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    if os.path.exists(register_file):
        return pd.read_csv(register_file)

    print('Gathering player lookup table. This may take a moment.')
    # Fail loudly on a bad response instead of feeding an error page to the
    # CSV parser, and do not block forever on a stalled connection.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    mlb_only_cols = ['key_retro', 'key_bbref', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']
    cols_to_keep = ['name_last', 'name_first', 'key_mlbam'] + mlb_only_cols
    table = pd.read_csv(io.StringIO(response.content.decode('utf-8')), usecols=cols_to_keep)

    # Keep only the major league rows: drop people for whom every
    # MLB-specific column is empty, then renumber the index.
    table = table.dropna(how='all', subset=mlb_only_cols).reset_index(drop=True)

    # The IDs are originally returned as floats, which is wrong. Integer
    # columns cannot hold NaN, so missing IDs become -1 before the cast.
    id_cols = ['key_mlbam', 'key_fangraphs']
    table[id_cols] = table[id_cols].fillna(-1).astype(int)

    # Reorder the columns to the right order
    table = table[cols_to_keep]

    if save:
        table.to_csv(register_file, index=False)

    return table


def get_lookup_table(save=False):
    """Return the Chadwick register prepared for name and ID searches.

    Fetches the table through ``chadwick_register(save)``, lower-cases both
    name columns, and makes sure the ``key_mlbam`` / ``key_fangraphs`` columns
    are integers, using ``-1`` where an ID is missing.

    Args:
        save: Passed straight through to ``chadwick_register``.

    Returns:
        The prepared DataFrame.
    """
    lookup = chadwick_register(save)

    # make these lowercase to avoid capitalization mistakes when searching
    for name_col in ('name_last', 'name_first'):
        lookup[name_col] = lookup[name_col].str.lower()

    # Pandas cannot handle NaNs in integer columns. We need IDs to be ints for
    # successful queries in statcast, etc.
    # Workaround: replace ID NaNs with -1, then convert columns to integers.
    # User will have to understand that -1 is not a valid ID.
    id_cols = ['key_mlbam', 'key_fangraphs']
    lookup[id_cols] = lookup[id_cols].fillna(-1)
    # originally returned as floats which is wrong
    lookup[id_cols] = lookup[id_cols].astype(int)

    return lookup


Expand All @@ -31,32 +53,32 @@ def playerid_lookup(last=None, first=None, player_list=None):
if first:
first = first.lower()
table = get_lookup_table()

# if player_list has a value, then the user is passing in a list of players
# the list of players may be comma delimited for last, first, or just last
if player_list:
player_counter = 1
for player in player_list:
last = player.split(",")[0].strip()
first = None
if(len(player.split(",")) > 1):
if (len(player.split(",")) > 1):
first = player.split(",")[1].strip()
if(player_counter == 1):
if (player_counter == 1):
results = playerid_lookup(last, first)
else:
results = results.append(playerid_lookup(last, first), ignore_index = True)
results = results.append(playerid_lookup(last, first), ignore_index=True)
player_counter += 1
return results



if first is None:
results = table.loc[table['name_last']==last]
results = table.loc[table['name_last'] == last]
else:
results = table.loc[(table['name_last']==last) & (table['name_first']==first)]
results = table.loc[(table['name_last'] == last) & (table['name_first'] == first)]
#results[['key_mlbam', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']] = results[['key_mlbam', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']].astype(int) # originally returned as floats which is wrong
results = results.reset_index().drop('index', 1)
return results


# data = playerid_lookup('bonilla')
# data = playerid_lookup('bonilla', 'bobby')

Expand All @@ -71,14 +93,18 @@ def playerid_reverse_lookup(player_ids, key_type=None):
:rtype: :class:`pandas.core.frame.DataFrame`
"""
key_types = ('mlbam', 'retro', 'bbref', 'fangraphs', )
key_types = (
'mlbam',
'retro',
'bbref',
'fangraphs',
)

if not key_type:
key_type = key_types[0] # default is "mlbam" if key_type not provided
key_type = key_types[0] # default is "mlbam" if key_type not provided
elif key_type not in key_types:
raise ValueError(
'[Key Type: {}] Invalid; Key Type must be one of "{}"'.format(key_type, '", "'.join(key_types))
)
raise ValueError('[Key Type: {}] Invalid; Key Type must be one of "{}"'.format(
key_type, '", "'.join(key_types)))

table = get_lookup_table()
key = 'key_{}'.format(key_type)
Expand Down

0 comments on commit 4679d82

Please sign in to comment.