Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chadwick Lookup Table #7

Merged
merged 3 commits into from
Jun 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/chadwick_register.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Player ID Lookup

`chadwick_register(save: bool = False)`

Get the Chadwick Bureau register of people (the player ID lookup data) as a pandas DataFrame.

## Arguments

`save:` bool, default `False`. If `True`, save the downloaded register to disk as `chadwick-register.csv`.

<!-- This data comes from Chadwick Bureau, meaning that there are several people in this data who are not MLB players. For this reason, supplying both last and first name is recommended to narrow your search. -->

## Example

```python
from pybaseball import chadwick_register

# get the register data
data = chadwick_register()

# get the register data and save to disk
data = chadwick_register(save=True)
```
1 change: 1 addition & 0 deletions pybaseball/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pybaseball.utils
from .playerid_lookup import playerid_reverse_lookup
from .playerid_lookup import playerid_lookup
from .playerid_lookup import chadwick_register
from .statcast import statcast, statcast_single_game
from .statcast_pitcher import statcast_pitcher
from .statcast_batter import statcast_batter, statcast_batter_exitvelo_barrels
Expand Down
76 changes: 51 additions & 25 deletions pybaseball/playerid_lookup.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,48 @@
import pandas as pd
import requests
import io
import os

# dropped key_uuid. looks like a hash we wouldn't need for anything.
# TODO: allow for typos. String similarity?
# dropped key_uuid. looks like a hash we wouldn't need for anything.
# TODO: allow for typos. String similarity?

url = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv"
register_file = 'chadwick-register.csv'

def chadwick_register(save: bool = False) -> pd.DataFrame:
''' Get the Chadwick register Database '''

if os.path.exists(register_file):
table = pd.read_csv(register_file)
return table

def get_lookup_table():
print('Gathering player lookup table. This may take a moment.')
url = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv"
s=requests.get(url).content
table = pd.read_csv(io.StringIO(s.decode('utf-8')), dtype={'key_sr_nfl': object, 'key_sr_nba': object, 'key_sr_nhl': object})
#subset columns
cols_to_keep = ['name_last','name_first','key_mlbam', 'key_retro', 'key_bbref', 'key_fangraphs', 'mlb_played_first','mlb_played_last']
s = requests.get(url).content
mlb_only_cols = ['key_retro', 'key_bbref', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']
cols_to_keep = ['name_last', 'name_first', 'key_mlbam'] + mlb_only_cols
table = pd.read_csv(io.StringIO(s.decode('utf-8')), usecols=cols_to_keep)

table.dropna(how='all', subset=mlb_only_cols, inplace=True) # Keep only the major league rows
table.reset_index(inplace=True, drop=True)

table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].fillna(-1)
# originally returned as floats which is wrong
table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].astype(int)

# Reorder the columns to the right order
table = table[cols_to_keep]

if save:
table.to_csv(register_file, index=False)

return table


def get_lookup_table(save=False):
table = chadwick_register(save)
#make these lowercase to avoid capitalization mistakes when searching
table['name_last'] = table['name_last'].str.lower()
table['name_first'] = table['name_first'].str.lower()
# Pandas cannot handle NaNs in integer columns. We need IDs to be ints for successful queries in statcast, etc.
# Workaround: replace ID NaNs with -1, then convert columns to integers. User will have to understand that -1 is not a valid ID.
table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].fillna(-1)
table[['key_mlbam', 'key_fangraphs']] = table[['key_mlbam', 'key_fangraphs']].astype(int) # originally returned as floats which is wrong
return table


Expand All @@ -31,32 +53,32 @@ def playerid_lookup(last=None, first=None, player_list=None):
if first:
first = first.lower()
table = get_lookup_table()

# if player_list has a value, then the user is passing in a list of players
# the list of players may be comma delimited for last, first, or just last
if player_list:
player_counter = 1
for player in player_list:
last = player.split(",")[0].strip()
first = None
if(len(player.split(",")) > 1):
if (len(player.split(",")) > 1):
first = player.split(",")[1].strip()
if(player_counter == 1):
if (player_counter == 1):
results = playerid_lookup(last, first)
else:
results = results.append(playerid_lookup(last, first), ignore_index = True)
results = results.append(playerid_lookup(last, first), ignore_index=True)
player_counter += 1
return results



if first is None:
results = table.loc[table['name_last']==last]
results = table.loc[table['name_last'] == last]
else:
results = table.loc[(table['name_last']==last) & (table['name_first']==first)]
results = table.loc[(table['name_last'] == last) & (table['name_first'] == first)]
#results[['key_mlbam', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']] = results[['key_mlbam', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']].astype(int) # originally returned as floats which is wrong
results = results.reset_index().drop('index', 1)
return results


# data = playerid_lookup('bonilla')
# data = playerid_lookup('bonilla', 'bobby')

Expand All @@ -71,14 +93,18 @@ def playerid_reverse_lookup(player_ids, key_type=None):

:rtype: :class:`pandas.core.frame.DataFrame`
"""
key_types = ('mlbam', 'retro', 'bbref', 'fangraphs', )
key_types = (
'mlbam',
'retro',
'bbref',
'fangraphs',
)

if not key_type:
key_type = key_types[0] # default is "mlbam" if key_type not provided
key_type = key_types[0] # default is "mlbam" if key_type not provided
elif key_type not in key_types:
raise ValueError(
'[Key Type: {}] Invalid; Key Type must be one of "{}"'.format(key_type, '", "'.join(key_types))
)
raise ValueError('[Key Type: {}] Invalid; Key Type must be one of "{}"'.format(
key_type, '", "'.join(key_types)))

table = get_lookup_table()
key = 'key_{}'.format(key_type)
Expand Down