Skip to content

Commit

Permalink
Adapt for BR now supporting Negro Leagues (#215)
Browse files Browse the repository at this point in the history
  • Loading branch information
TheCleric authored Jul 1, 2021
1 parent ca4d8ca commit 8a8e398
Show file tree
Hide file tree
Showing 8 changed files with 3,066 additions and 3,033 deletions.
5,860 changes: 2,905 additions & 2,955 deletions pybaseball/data/fangraphs_teams.csv

Large diffs are not rendered by default.

26 changes: 13 additions & 13 deletions pybaseball/standings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from bs4 import BeautifulSoup, Comment, PageElement, ResultSet

from . import cache
from .utils import most_recent_season
Expand All @@ -16,22 +16,22 @@ def get_soup(year: int) -> BeautifulSoup:
def get_tables(soup: BeautifulSoup, season: int) -> List[pd.DataFrame]:
datasets = []
if season >= 1969:
tables = soup.find_all('table')
tables: List[PageElement] = soup.find_all('table')
if season == 1981:
# For some reason BRef has 1981 broken down by halves and overall
# https://www.baseball-reference.com/leagues/MLB/1981-standings.shtml
tables = [x for x in tables if 'overall' in x.get('id', '')]
for table in tables:
data = []
headings = [th.get_text() for th in table.find("tr").find_all("th")]
headings: List[PageElement] = [th.get_text() for th in table.find("tr").find_all("th")]
data.append(headings)
table_body = table.find('tbody')
rows = table_body.find_all('tr')
table_body: PageElement = table.find('tbody')
rows: List[PageElement] = table_body.find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
cols.insert(0,row.find_all('a')[0]['title']) # team name
data.append([ele for ele in cols if ele])
cols: List[PageElement] = row.find_all('td')
cols_text: List[str] = [ele.text.strip() for ele in cols]
cols_text.insert(0, row.find_all('a')[0].text.strip()) # team name
data.append([ele for ele in cols_text if ele])
datasets.append(data)
else:
data = []
Expand Down Expand Up @@ -64,7 +64,7 @@ def get_tables(soup: BeautifulSoup, season: int) -> List[pd.DataFrame]:
for _ in range(16):
cols.pop()
cols = [ele.text.strip() for ele in cols]
cols.insert(0,row.find_all('a')[0]['title']) # team name
cols.insert(0,row.find_all('a')[0].text.strip()) # team name
data.append([ele for ele in cols if ele])
datasets.append(data)
#convert list-of-lists to dataframes
Expand All @@ -78,10 +78,10 @@ def standings(season:Optional[int] = None) -> pd.DataFrame:
# get most recent standings if date not specified
if season is None:
season = most_recent_season()
if season < 1871:
if season < 1876:
raise ValueError(
"This query currently only returns standings until the 1871 season. "
"Try looking at years from 1871 to present."
"This query currently only returns standings until the 1876 season. "
"Try looking at years from 1876 to present."
)

# retrieve html from baseball reference
Expand Down
33 changes: 18 additions & 15 deletions pybaseball/team_results.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
from datetime import datetime
from typing import Optional

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from pybaseball.utils import first_season_map, most_recent_season
from pybaseball.utils import get_first_season, most_recent_season

from . import cache

# TODO: retrieve data for all teams? a full season's worth of results

def get_soup(season, team):
def get_soup(season: Optional[int], team: str) -> BeautifulSoup:
# get most recent year's schedule if year not specified
if season is None:
season = most_recent_season()
url = "http://www.baseball-reference.com/teams/{}/{}-schedule-scores.shtml".format(team, season)
s=requests.get(url).content
print(url)
s = requests.get(url).content
return BeautifulSoup(s, "lxml")

def get_table(soup,team):
def get_table(soup: BeautifulSoup, team: str) -> pd.DataFrame:
try:
table = soup.find_all('table')[0]
except:
Expand Down Expand Up @@ -64,14 +66,14 @@ def get_table(soup,team):
cols = [ele.text.strip() for ele in cols][0:5]
data.append([ele for ele in cols if ele])
#convert to pandas dataframe. make first row the table's column names and reindex.
data = pd.DataFrame(data)
data = data.rename(columns=data.iloc[0])
data = data.reindex(data.index.drop(0))
data = data.drop('',1) #not a useful column
data['Attendance'].replace(r'^Unknown$', np.nan, regex=True, inplace = True) # make this a NaN so the column can benumeric
return data
df = pd.DataFrame(data)
df = df.rename(columns=df.iloc[0])
df = df.reindex(df.index.drop(0))
df = df.drop('',1) #not a useful column
df['Attendance'].replace(r'^Unknown$', np.nan, regex=True, inplace = True) # make this a NaN so the column can benumeric
return df

def process_win_streak(data):
def process_win_streak(data: pd.DataFrame) -> pd.DataFrame:
"""
Convert "+++"/"---" formatted win/loss streak column into a +/- integer column
"""
Expand All @@ -83,7 +85,7 @@ def process_win_streak(data):
data = data.drop('Streak2',1)
return data

def make_numeric(data):
def make_numeric(data: pd.DataFrame) -> pd.DataFrame:
# first remove commas from attendance values
# skip if column is all NA (not sure if everyone kept records in the early days)
if data['Attendance'].count() > 0:
Expand All @@ -98,12 +100,13 @@ def make_numeric(data):
return data

@cache.df_cache()
def schedule_and_record(season=None, team=None):
def schedule_and_record(season: int, team: str) -> pd.DataFrame:
# retrieve html from baseball reference
# sanitize input
team = team.upper()
try:
if season < first_season_map[team]:
first_season = get_first_season(team)
if first_season is None or season < first_season:
m = "Season cannot be before first year of a team's existence"
raise ValueError(m)
# ignore validation if team isn't found in dictionary
Expand Down
4 changes: 2 additions & 2 deletions pybaseball/teamid_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,13 @@ def _generate_teams() -> pd.DataFrame:
Should only need to be run when a team is added, removed, or moves to a new city.
"""

start_season = 1871
start_season = 1876
end_season = most_recent_season()

lahman_columns = ['yearID', 'lgID', 'teamID', 'franchID', 'divID', 'name', 'teamIDBR', 'teamIDlahman45',
'teamIDretro']

lahman_teams = lahman.teams()[lahman_columns]
lahman_teams = lahman.teams().query('yearID >= @start_season')[lahman_columns]

# Only getting AB to make payload small, and you have to specify at least one column
fg_team_data = fangraphs.fg_team_batting_data(start_season, end_season, "ALL", stat_columns=['AB'])
Expand Down
137 changes: 100 additions & 37 deletions pybaseball/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from collections import namedtuple
from datetime import date, datetime, timedelta
import functools
import io
from typing import Dict, Iterator, Optional, Tuple, Union
import zipfile
from collections import namedtuple
from datetime import date, datetime, timedelta
from typing import Iterator, Optional, Union, Tuple

import pandas as pd
import requests
Expand All @@ -13,32 +13,94 @@
DATE_FORMAT = "%Y-%m-%d"

# dictionary containing team abbreviations and their first year in existence
first_season_map = {'ALT': 1884, 'ANA': 1997, 'ARI': 1998, 'ATH': 1871,
'ATL': 1966, 'BAL': 1872, 'BLA': 1901, 'BLN': 1892,
'BLU': 1884, 'BOS': 1871, 'BRA': 1872, 'BRG': 1890,
'BRO': 1884, 'BSN': 1876, 'BTT': 1914, 'BUF': 1879,
'BWW': 1890, 'CAL': 1965, 'CEN': 1875, 'CHC': 1876,
'CHI': 1871, 'CHW': 1901, 'CIN': 1876, 'CKK': 1891,
'CLE': 1871, 'CLV': 1879, 'COL': 1883, 'COR': 1884,
'CPI': 1884, 'DET': 1901, 'DTN': 1881, 'ECK': 1872,
'FLA': 1993, 'HAR': 1874, 'HOU': 1962, 'IND': 1878,
'KCA': 1955, 'KCC': 1884, 'KCN': 1886, 'KCP': 1914,
'KCR': 1969, 'KEK': 1871, 'LAA': 1961, 'LAD': 1958,
'LOU': 1876, 'MAN': 1872, 'MAR': 1873, 'MIA': 2012,
'MIL': 1884, 'MIN': 1961, 'MLA': 1901, 'MLG': 1878,
'MLN': 1953, 'MON': 1969, 'NAT': 1872, 'NEW': 1915,
'NHV': 1875, 'NYG': 1883, 'NYI': 1890, 'NYM': 1962,
'NYP': 1883, 'NYU': 1871, 'NYY': 1903, 'OAK': 1968,
'OLY': 1871, 'PBB': 1890, 'PBS': 1914, 'PHA': 1882,
'PHI': 1873, 'PHK': 1884, 'PHQ': 1890, 'PIT': 1882,
'PRO': 1878, 'RES': 1873, 'RIC': 1884, 'ROC': 1890,
'ROK': 1871, 'SDP': 1969, 'SEA': 1977, 'SEP': 1969,
'SFG': 1958, 'SLB': 1902, 'SLM': 1884, 'SLR': 1875,
'STL': 1875, 'STP': 1884, 'SYR': 1879, 'TBD': 1998,
'TBR': 2008, 'TEX': 1972, 'TOL': 1884, 'TOR': 1977,
'TRO': 1871, 'WAS': 1873, 'WES': 1875, 'WHS': 1884,
'WIL': 1884, 'WOR': 1880, 'WSA': 1961, 'WSH': 1901,
'WSN': 2005}
# https://www.baseball-reference.com/teams/
# Nones mean that team only exists as an alias
first_season_map: Dict[str, Optional[int]] = {
'AB2': 1931, 'AB3': 1938, 'ABC': 1920, 'AC' : 1923, 'AG' : 1933, 'ALT': 1884, 'ANA': 1997, 'ARI': 1998,
'ATH': 1876, 'ATL': 1966, 'BAG': None, 'BAL': 1954, 'BBB': 1924, 'BBS': 1923, 'BCA': 1932, 'BE' : 1935,
'BEG': 1938, 'BFB': 1890, 'BFL': None, 'BLA': 1901, 'BLN': 1892, 'BLO': 1882, 'BLT': 1914, 'BLU': 1884,
'BOS': 1901, 'BR2': 1923, 'BRA': 1872, 'BRD': 1884, 'BRG': 1890, 'BRO': 1884, 'BRS': 1890, 'BSN': 1876,
'BTT': 1914, 'BUF': 1879, 'BWW': 1890, 'CAG': 1920, 'CAL': 1965, 'CBB': 1933, 'CBE': 1943, 'CBK': 1883,
'CBL': 1870, 'CBN': 1924, 'CBR': 1939, 'CC' : 1943, 'CCB': 1942, 'CCU': 1931, 'CEG': 1935, 'CEL': 1926,
'CEN': 1875, 'CG' : 1933, 'CHC': 1876, 'CHH': 1914, 'CHP': 1890, 'CHT': 1927, 'CHW': 1901, 'CIN': 1876,
'CKK': 1891, 'CL2': 1932, 'CLE': 1901, 'CLI': 1890, 'CLS': 1889, 'CLV': 1887, 'CNR': 1876, 'CNS': 1880,
'COB': 1921, 'COG': 1920, 'COL': 1883, 'COR': 1884, 'COT': 1932, 'CPI': 1884, 'CRS': 1934, 'CS' : 1921,
'CSE': 1923, 'CSW': 1920, 'CT' : 1937, 'CTG': 1928, 'CTS': 1922, 'CUP': 1932, 'DET': 1901, 'DM' : 1920,
'DS' : 1920, 'DTN': 1881, 'DTS': 1937, 'DW' : 1932, 'DYM': 1920, 'ECK': 1872, 'FLA': 1993, 'HAR': 1876,
'HBG': 1924, 'HG' : 1929, 'HIL': 1923, 'HOU': 1962, 'IA' : 1937, 'IAB': 1939, 'IBL': 1878, 'IC' : 1946,
'ID' : 1933, 'IHO': 1884, 'IND': 1887, 'JRC': 1938, 'KCA': 1955, 'KCC': 1888, 'KCM': 1920, 'KCN': 1886,
'KCP': 1914, 'KCR': 1969, 'KCU': 1884, 'KEK': 1871, 'LAA': 1961, 'LAD': 1958, 'LGR': 1876, 'LOU': 1882,
'LOW': 1931, 'LRG': 1932, 'LVB': 1930, 'MAN': 1872, 'MAR': 1873, 'MB' : 1923, 'MGS': 1932, 'MIA': 2012,
'MIL': 1884, 'MIN': 1961, 'MLA': 1891, 'MLG': 1878, 'MLN': 1953, 'MLU': 1884, 'MON': 1969, 'MRM': 1932,
'MRS': 1924, 'NAT': 1872, 'NBY': 1936, 'ND' : 1934, 'NE' : 1936, 'NEG': 1930, 'NEW': 1915, 'NHV': 1875,
'NLG': 1923, 'NS' : 1926, 'NWB': 1932, 'NYC': 1935, 'NYG': 1883, 'NYI': 1890, 'NYM': 1962, 'NYP': 1883,
'NYU': 1876, 'NYY': 1903, 'OAK': 1968, 'OLY': 1871, 'PBB': 1890, 'PBG': 1934, 'PBK': 1922, 'PBS': 1914,
'PC' : 1933, 'PHA': 1882, 'PHI': 1873, 'PHK': 1884, 'PHQ': 1890, 'PIT': 1882, 'PK' : 1922, 'PRO': 1878,
'PS' : 1934, 'PTG': 1928, 'RES': 1873, 'RIC': 1884, 'ROC': 1890, 'ROK': 1871, 'SBS': 1876, 'SDP': 1969,
'SEA': 1977, 'SEN': 1938, 'SEP': 1969, 'SFG': 1958, 'SL2': 1937, 'SL3': 1939, 'SLB': 1902, 'SLG': 1920,
'SLI': 1914, 'SLM': 1884, 'SLR': 1875, 'SLS': 1922, 'SNH': 1938, 'SNS': 1940, 'STL': 1875, 'STP': 1884,
'SYR': 1879, 'SYS': 1890, 'TBD': 1998, 'TBR': 2008, 'TC' : 1940, 'TC2': 1939, 'TEX': 1972, 'TLM': 1890,
'TOL': 1884, 'TOR': 1977, 'TRO': 1871, 'TRT': 1879, 'TT' : 1923, 'WAP': 1932, 'WAS': 1884, 'WEG': 1936,
'WES': 1875, 'WHS': 1892, 'WIL': 1884, 'WMP': 1925, 'WNA': 1884, 'WNL': 1886, 'WOR': 1880, 'WP' : 1924,
'WSA': 1961, 'WSH': 1901, 'WSN': 2005, 'WST': 1884,
}

team_equivalents = [
{'ANA', 'CAL', 'LAA'},
{'BSN', 'MLN', 'ATL'},
{'BLO', 'BLN', 'BLT', 'MLA', 'SLB', 'BAL'},
{'BRD', 'BRS', 'BOS'},
{'BRO', 'LAD'},
{'PHA', 'OAK'},
{'FLA', 'MIA'},
{'SEP', 'MIL'},
{'WSH', 'MIN'},
{'MON', 'WSN'},
{'NYG', 'SFG'},
{'TBD', 'TBR'},
{'BCA', 'IAB'},
{'AC' , 'BAG'},
{'BR2', 'BRG'},
{'NEG', 'CEG', 'WEG', 'BEG'},
{'CNS', 'CIN'},
{'CCB', 'CBE'},
{'CLE', 'CLV'},
{'CS' , 'CSW'},
{'AB2', 'ID' },
{'CC' , 'IC' },
{'JRC', 'CBR'},
{'LVB', 'LOW'},
{'BE' , 'NE' },
{'PC' , 'TC' , 'TC2'},
{'PBK', 'PK' },
{'SLG', 'SLS'},
# Potential issue here, as BR uses the same code (HAR) for both
# Hartford Dark Blues (NL 1876-1878)
# Harrisburg Stars (NNL 1943)
# These are two distinct teams, but with the same code in BR
{'AB3', 'SL3', 'SNS', 'HAR'},
{'WP' , 'WMP'},
{'WHS', 'WNA'},
{'WAS', 'WST'}
]

def get_first_season(team: str, include_equivalents: bool = True) -> Optional[int]:
    """
    Return the first season recorded for a team code.

    Args:
        team: Team abbreviation used as a key of ``first_season_map``.
        include_equivalents: When True (the default), also consider every
            code that shares a ``team_equivalents`` group with ``team`` and
            return the earliest season among them. When False, return only
            the team's own entry.

    Returns:
        The earliest known first season, or None when neither the team nor
        any of its equivalents has one (a None entry in ``first_season_map``
        means the code only exists as an alias).

    Raises:
        KeyError: If ``team`` (or a code listed as its equivalent) is not a
            key of ``first_season_map``.
    """
    own_first = first_season_map[team]
    if not include_equivalents:
        return own_first

    # Collect the team's own entry plus the entries of every code in every
    # equivalence group that contains the team.
    candidates = [own_first]
    for group in team_equivalents:
        if team in group:
            candidates.extend(first_season_map[code] for code in group)

    # Ignore alias-only (None) entries. If nothing is known, report None
    # instead of substituting the current year as a fake first season.
    known = [season for season in candidates if season is not None]
    return min(known, default=None)

STATCAST_VALID_DATES = {
2008: (date(2008, 3, 25), date(2008, 10, 27)),
Expand Down Expand Up @@ -288,25 +350,26 @@ def flag_imputed_data(statcast_df: pd.DataFrame) -> pd.DataFrame:

def norm_pitch_code(pitch: str, to_word: bool = False) -> str:
normed = pitch_name_to_code_map.get(pitch.upper())
normed = pitch_code_to_name_map.get(normed) if to_word else normed
normed = pitch_code_to_name_map.get(normed) if to_word and normed else normed
if normed is None:
if pitch.lower() == 'all':
raise ValueError("'All' is not a valid pitch in this particular context!")
raise ValueError(f'{pitch} is not a valid pitch!')
return normed

def norm_positions(pos: Union[int, str], to_word: bool = False, to_number: bool = True) -> str:
pos = str(pos) if type(pos) == int else pos
if pos in pos_code_to_numbers_map.values():
pos_str = str(pos)
normed: Optional[str] = None
if pos_str in pos_code_to_numbers_map.values():
to_number = False
normed = pos
normed = pos_str
else:
normed = pos_name_to_code_map.get(pos.upper())
normed = pos_code_to_name_map.get(normed) if to_word else normed
normed = pos_name_to_code_map.get(pos_str.upper())
normed = pos_code_to_name_map.get(normed) if to_word and normed else normed
if to_number:
if normed not in ["IF", "OF"]:
normed = pos_code_to_numbers_map.get(normed)
if pos.lower() == "all":
normed = pos_code_to_numbers_map.get(normed) if normed else normed
if pos_str.lower() == "all":
normed = ""
if normed is None:
raise ValueError(f'{pos} is not a valid position!')
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/pybaseball/test_standings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def get_division_counts_by_season(season: Optional[int]) -> int:

class TestBRefStandings:
@pytest.mark.parametrize(
"season", [(x) for x in range(1871, most_recent_season())]
"season", [(x) for x in range(1876, most_recent_season())]
)
def test_standings(self, season: Optional[int]) -> None:
standings_list = standings(season)
Expand All @@ -33,7 +33,7 @@ def test_standings(self, season: Optional[int]) -> None:
assert len(data.columns) > 0
assert len(data.index) > 0

def test_standings_pre_1871(self) -> None:
def test_standings_pre_1876(self) -> None:
season = 1870

with pytest.raises(ValueError):
Expand Down
29 changes: 23 additions & 6 deletions tests/integration/pybaseball/test_team_results.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
from typing import Optional

import pytest

from pybaseball.team_results import get_soup, schedule_and_record
from pybaseball.utils import first_season_map, most_recent_season
from pybaseball.utils import first_season_map, get_first_season, most_recent_season


missing_schedules_scores = {
'AB2', 'AB3', 'ABC', 'AC' , 'AG' , 'BBB', 'BBS', 'BCA', 'BE' , 'BEG', 'BFB', 'BLO', 'BLT', 'BR2', 'BRD', 'BRS',
'CAG', 'CBB', 'CBE', 'CBK', 'CBL', 'CBN', 'CBR', 'CC' , 'CCB', 'CCU', 'CEG', 'CEL', 'CG', 'CHH', 'CHI', 'CHP', 'CHT',
'CL2', 'CLI', 'CLS', 'CLV', 'CNR', 'CNS', 'COB', 'COG', 'COT', 'CRS', 'CS' , 'CSE', 'CSW', 'CT' , 'CTG', 'CTS', 'CUP',
'DM' , 'DS' , 'DTS', 'DW' , 'DYM', 'HBG', 'HG' , 'HIL', 'IA' , 'IAB', 'IBL', 'IC' , 'ID' , 'IHO', 'JRC', 'KCM', 'KCU',
'LGR', 'LOW', 'LRG', 'LVB', 'MB' , 'MGS', 'MLA', 'MLU', 'MRM', 'MRS', 'NBY', 'ND' , 'NE' , 'NEG', 'NLG', 'NS' , 'NWB',
'NYC', 'PBG', 'PBK', 'PC' , 'PK' , 'PS' , 'PTG', 'SBS', 'SEN', 'SL2', 'SL3', 'SLG', 'SLI', 'SLS', 'SNH', 'SNS',
'SYS', 'TC' , 'TC2', 'TLM', 'TRT', 'TT' , 'WAP', 'WEG', 'WMP', 'WNA', 'WNL', 'WP' , 'WST'
}

@pytest.mark.parametrize(
"season, team", [
(first_season_map[x], x) for x in first_season_map.keys()
(get_first_season(x, False), x) for x in first_season_map.keys()
]
)
def test_schedule_and_record(season: int, team: str) -> None:
def test_schedule_and_record(season: Optional[int], team: str) -> None:
if season is None or team in missing_schedules_scores:
return
result = schedule_and_record(season, team)

assert result is not None
Expand All @@ -28,12 +42,15 @@ def test_schedule_and_record(season: int, team: str) -> None:

@pytest.mark.parametrize(
"season, team", [
(first_season_map[x] - 1, x) for x in first_season_map.keys()
(get_first_season(x, False), x) for x in first_season_map.keys()
]
)
def test_schedule_and_record_bad_years(season: int, team: str) -> None:
def test_schedule_and_record_bad_years(season: Optional[int], team: str) -> None:
if season is None:
return

with pytest.raises(ValueError):
schedule_and_record(season, team)
schedule_and_record(season - 1, team)

def test_schedule_and_record_after_existence() -> None:
with pytest.raises(ValueError):
Expand Down
Loading

0 comments on commit 8a8e398

Please sign in to comment.