Adapt for BR now supporting Negro Leagues (#215)

jldbc · Jul 1, 2021 · 8a8e398 · 8a8e398
1 parent ca4d8ca
commit 8a8e398
Show file tree

Hide file tree

Showing 8 changed files with 3,066 additions and 3,033 deletions.
diff --git a/pybaseball/data/fangraphs_teams.csv b/pybaseball/data/fangraphs_teams.csv
diff --git a/pybaseball/standings.py b/pybaseball/standings.py
@@ -2,7 +2,7 @@
 
 import pandas as pd
 import requests
-from bs4 import BeautifulSoup, Comment
+from bs4 import BeautifulSoup, Comment, PageElement, ResultSet
 
 from . import cache
 from .utils import most_recent_season
@@ -16,22 +16,22 @@ def get_soup(year: int) -> BeautifulSoup:
 def get_tables(soup: BeautifulSoup, season: int) -> List[pd.DataFrame]:
     datasets = []
     if season >= 1969:
-        tables = soup.find_all('table')
+        tables: List[PageElement] = soup.find_all('table')
         if season == 1981:
             # For some reason BRef has 1981 broken down by halves and overall
             # https://www.baseball-reference.com/leagues/MLB/1981-standings.shtml
             tables = [x for x in tables if 'overall' in x.get('id', '')]
         for table in tables:
             data = []
-            headings = [th.get_text() for th in table.find("tr").find_all("th")]
+            headings: List[PageElement] = [th.get_text() for th in table.find("tr").find_all("th")]
             data.append(headings)
-            table_body = table.find('tbody')
-            rows = table_body.find_all('tr')
+            table_body: PageElement = table.find('tbody')
+            rows: List[PageElement] = table_body.find_all('tr')
             for row in rows:
-                cols = row.find_all('td')
-                cols = [ele.text.strip() for ele in cols]
-                cols.insert(0,row.find_all('a')[0]['title']) # team name
-                data.append([ele for ele in cols if ele])
+                cols: List[PageElement] = row.find_all('td')
+                cols_text: List[str] = [ele.text.strip() for ele in cols]
+                cols_text.insert(0, row.find_all('a')[0].text.strip()) # team name
+                data.append([ele for ele in cols_text if ele])
             datasets.append(data)
     else:
         data = []
@@ -64,7 +64,7 @@ def get_tables(soup: BeautifulSoup, season: int) -> List[pd.DataFrame]:
                 for _ in range(16):
                     cols.pop()
             cols = [ele.text.strip() for ele in cols]
-            cols.insert(0,row.find_all('a')[0]['title']) # team name
+            cols.insert(0,row.find_all('a')[0].text.strip()) # team name
             data.append([ele for ele in cols if ele])
         datasets.append(data)
     #convert list-of-lists to dataframes
@@ -78,10 +78,10 @@ def standings(season:Optional[int] = None) -> pd.DataFrame:
     # get most recent standings if date not specified
     if season is None:
         season = most_recent_season()
-    if season < 1871:
+    if season < 1876:
         raise ValueError(
-            "This query currently only returns standings until the 1871 season. "
-            "Try looking at years from 1871 to present."
+            "This query currently only returns standings until the 1876 season. "
+            "Try looking at years from 1876 to present."
         )
 
     # retrieve html from baseball reference

diff --git a/pybaseball/team_results.py b/pybaseball/team_results.py
@@ -1,25 +1,27 @@
 from datetime import datetime
+from typing import Optional
 
+from bs4 import BeautifulSoup
 import numpy as np
 import pandas as pd
 import requests
-from bs4 import BeautifulSoup
 
-from pybaseball.utils import first_season_map, most_recent_season
+from pybaseball.utils import get_first_season, most_recent_season
 
 from . import cache
 
 # TODO: retrieve data for all teams? a full season's worth of results
 
-def get_soup(season, team):
+def get_soup(season: Optional[int], team: str) -> BeautifulSoup:
+    """
+    Download and parse a team's Baseball Reference schedule-and-scores page.
+
+    Args:
+        season: Season to request; when None, most_recent_season() is used.
+        team: Team code inserted into the page URL.
+
+    Returns:
+        The page content parsed by BeautifulSoup with the "lxml" parser.
+    """
     # get most recent year's schedule if year not specified
     if season is None:
         season = most_recent_season()
     url = "http://www.baseball-reference.com/teams/{}/{}-schedule-scores.shtml".format(team, season)
-    s=requests.get(url).content
+    s = requests.get(url).content
     return BeautifulSoup(s, "lxml")
 
-def get_table(soup,team):
+def get_table(soup: BeautifulSoup, team: str) -> pd.DataFrame:
     try:
         table = soup.find_all('table')[0]
     except:
@@ -64,14 +66,14 @@ def get_table(soup,team):
                 cols = [ele.text.strip() for ele in cols][0:5]
                 data.append([ele for ele in cols if ele])
     #convert to pandas dataframe. make first row the table's column names and reindex.
-    data = pd.DataFrame(data)
-    data = data.rename(columns=data.iloc[0])
-    data = data.reindex(data.index.drop(0))
-    data = data.drop('',1) #not a useful column
-    data['Attendance'].replace(r'^Unknown$', np.nan, regex=True, inplace = True) # make this a NaN so the column can benumeric
-    return data
+    df = pd.DataFrame(data)
+    df = df.rename(columns=df.iloc[0])
+    df = df.reindex(df.index.drop(0))
+    df = df.drop('',1) #not a useful column
+    df['Attendance'].replace(r'^Unknown$', np.nan, regex=True, inplace = True) # make this a NaN so the column can benumeric
+    return df
 
-def process_win_streak(data):
+def process_win_streak(data: pd.DataFrame) -> pd.DataFrame:
     """
     Convert "+++"/"---" formatted win/loss streak column into a +/- integer column
     """
@@ -83,7 +85,7 @@ def process_win_streak(data):
         data = data.drop('Streak2',1)
     return data
 
-def make_numeric(data):
+def make_numeric(data: pd.DataFrame) -> pd.DataFrame:
     # first remove commas from attendance values
     # skip if column is all NA (not sure if everyone kept records in the early days)
     if data['Attendance'].count() > 0:
@@ -98,12 +100,13 @@ def make_numeric(data):
     return data
 
 @cache.df_cache()
-def schedule_and_record(season=None, team=None):
+def schedule_and_record(season: int, team: str) -> pd.DataFrame:
     # retrieve html from baseball reference
     # sanatize input
     team = team.upper()
     try:
-        if season < first_season_map[team]:
+        first_season = get_first_season(team)
+        if first_season is None or season < first_season:
             m = "Season cannot be before first year of a team's existence"
             raise ValueError(m)
     # ignore validation if team isn't found in dictionary

diff --git a/pybaseball/teamid_lookup.py b/pybaseball/teamid_lookup.py
@@ -103,13 +103,13 @@ def _generate_teams() -> pd.DataFrame:
     Should only need to be run when a team is added, removed, or moves to a new city.
     """
 
-    start_season = 1871
+    start_season = 1876
     end_season = most_recent_season()
 
     lahman_columns = ['yearID', 'lgID', 'teamID', 'franchID', 'divID', 'name', 'teamIDBR', 'teamIDlahman45',
                       'teamIDretro']
 
-    lahman_teams = lahman.teams()[lahman_columns]
+    lahman_teams = lahman.teams().query('yearID >= @start_season')[lahman_columns]
 
     # Only getting AB to make payload small, and you have to specify at least one column
     fg_team_data = fangraphs.fg_team_batting_data(start_season, end_season, "ALL", stat_columns=['AB'])

diff --git a/pybaseball/utils.py b/pybaseball/utils.py
@@ -1,9 +1,9 @@
+from collections import namedtuple
+from datetime import date, datetime, timedelta
 import functools
 import io
+from typing import Dict, Iterator, Optional, Tuple, Union
 import zipfile
-from collections import namedtuple
-from datetime import date, datetime, timedelta
-from typing import Iterator, Optional, Union, Tuple
 
 import pandas as pd
 import requests
@@ -13,32 +13,94 @@
 DATE_FORMAT = "%Y-%m-%d"
 
 # dictionary containing team abbreviations and their first year in existance
-first_season_map = {'ALT': 1884, 'ANA': 1997, 'ARI': 1998, 'ATH': 1871,
-					'ATL': 1966, 'BAL': 1872, 'BLA': 1901, 'BLN': 1892,
-					'BLU': 1884, 'BOS': 1871, 'BRA': 1872, 'BRG': 1890,
-					'BRO': 1884, 'BSN': 1876, 'BTT': 1914, 'BUF': 1879,
-					'BWW': 1890, 'CAL': 1965, 'CEN': 1875, 'CHC': 1876,
-					'CHI': 1871, 'CHW': 1901, 'CIN': 1876, 'CKK': 1891,
-					'CLE': 1871, 'CLV': 1879, 'COL': 1883, 'COR': 1884,
-					'CPI': 1884, 'DET': 1901, 'DTN': 1881, 'ECK': 1872,
-					'FLA': 1993, 'HAR': 1874, 'HOU': 1962, 'IND': 1878,
-					'KCA': 1955, 'KCC': 1884, 'KCN': 1886, 'KCP': 1914,
-					'KCR': 1969, 'KEK': 1871, 'LAA': 1961, 'LAD': 1958,
-					'LOU': 1876, 'MAN': 1872, 'MAR': 1873, 'MIA': 2012,
-					'MIL': 1884, 'MIN': 1961, 'MLA': 1901, 'MLG': 1878,
-					'MLN': 1953, 'MON': 1969, 'NAT': 1872, 'NEW': 1915,
-					'NHV': 1875, 'NYG': 1883, 'NYI': 1890, 'NYM': 1962,
-					'NYP': 1883, 'NYU': 1871, 'NYY': 1903, 'OAK': 1968,
-					'OLY': 1871, 'PBB': 1890, 'PBS': 1914, 'PHA': 1882,
-					'PHI': 1873, 'PHK': 1884, 'PHQ': 1890, 'PIT': 1882,
-					'PRO': 1878, 'RES': 1873, 'RIC': 1884, 'ROC': 1890,
-					'ROK': 1871, 'SDP': 1969, 'SEA': 1977, 'SEP': 1969,
-					'SFG': 1958, 'SLB': 1902, 'SLM': 1884, 'SLR': 1875,
-					'STL': 1875, 'STP': 1884, 'SYR': 1879, 'TBD': 1998,
-					'TBR': 2008, 'TEX': 1972, 'TOL': 1884, 'TOR': 1977,
-					'TRO': 1871, 'WAS': 1873, 'WES': 1875, 'WHS': 1884,
-					'WIL': 1884, 'WOR': 1880, 'WSA': 1961, 'WSH': 1901,
-					'WSN': 2005}
+# https://www.baseball-reference.com/teams/
+# Nones mean that team only exists as an alias
+first_season_map: Dict[str, Optional[int]] = {
+    'AB2': 1931, 'AB3': 1938, 'ABC': 1920, 'AC' : 1923, 'AG' : 1933, 'ALT': 1884, 'ANA': 1997, 'ARI': 1998,
+    'ATH': 1876, 'ATL': 1966, 'BAG': None, 'BAL': 1954, 'BBB': 1924, 'BBS': 1923, 'BCA': 1932, 'BE' : 1935,
+    'BEG': 1938, 'BFB': 1890, 'BFL': None, 'BLA': 1901, 'BLN': 1892, 'BLO': 1882, 'BLT': 1914, 'BLU': 1884,
+    'BOS': 1901, 'BR2': 1923, 'BRA': 1872, 'BRD': 1884, 'BRG': 1890, 'BRO': 1884, 'BRS': 1890, 'BSN': 1876,
+    'BTT': 1914, 'BUF': 1879, 'BWW': 1890, 'CAG': 1920, 'CAL': 1965, 'CBB': 1933, 'CBE': 1943, 'CBK': 1883,
+    'CBL': 1870, 'CBN': 1924, 'CBR': 1939, 'CC' : 1943, 'CCB': 1942, 'CCU': 1931, 'CEG': 1935, 'CEL': 1926,
+    'CEN': 1875, 'CG' : 1933, 'CHC': 1876, 'CHH': 1914, 'CHP': 1890, 'CHT': 1927, 'CHW': 1901, 'CIN': 1876,
+    'CKK': 1891, 'CL2': 1932, 'CLE': 1901, 'CLI': 1890, 'CLS': 1889, 'CLV': 1887, 'CNR': 1876, 'CNS': 1880,
+    'COB': 1921, 'COG': 1920, 'COL': 1883, 'COR': 1884, 'COT': 1932, 'CPI': 1884, 'CRS': 1934, 'CS' : 1921,
+    'CSE': 1923, 'CSW': 1920, 'CT' : 1937, 'CTG': 1928, 'CTS': 1922, 'CUP': 1932, 'DET': 1901, 'DM' : 1920,
+    'DS' : 1920, 'DTN': 1881, 'DTS': 1937, 'DW' : 1932, 'DYM': 1920, 'ECK': 1872, 'FLA': 1993, 'HAR': 1876,
+    'HBG': 1924, 'HG' : 1929, 'HIL': 1923, 'HOU': 1962, 'IA' : 1937, 'IAB': 1939, 'IBL': 1878, 'IC' : 1946,
+    'ID' : 1933, 'IHO': 1884, 'IND': 1887, 'JRC': 1938, 'KCA': 1955, 'KCC': 1888, 'KCM': 1920, 'KCN': 1886,
+    'KCP': 1914, 'KCR': 1969, 'KCU': 1884, 'KEK': 1871, 'LAA': 1961, 'LAD': 1958, 'LGR': 1876, 'LOU': 1882,
+    'LOW': 1931, 'LRG': 1932, 'LVB': 1930, 'MAN': 1872, 'MAR': 1873, 'MB' : 1923, 'MGS': 1932, 'MIA': 2012,
+    'MIL': 1884, 'MIN': 1961, 'MLA': 1891, 'MLG': 1878, 'MLN': 1953, 'MLU': 1884, 'MON': 1969, 'MRM': 1932,
+    'MRS': 1924, 'NAT': 1872, 'NBY': 1936, 'ND' : 1934, 'NE' : 1936, 'NEG': 1930, 'NEW': 1915, 'NHV': 1875,
+    'NLG': 1923, 'NS' : 1926, 'NWB': 1932, 'NYC': 1935, 'NYG': 1883, 'NYI': 1890, 'NYM': 1962, 'NYP': 1883,
+    'NYU': 1876, 'NYY': 1903, 'OAK': 1968, 'OLY': 1871, 'PBB': 1890, 'PBG': 1934, 'PBK': 1922, 'PBS': 1914,
+    'PC' : 1933, 'PHA': 1882, 'PHI': 1873, 'PHK': 1884, 'PHQ': 1890, 'PIT': 1882, 'PK' : 1922, 'PRO': 1878,
+    'PS' : 1934, 'PTG': 1928, 'RES': 1873, 'RIC': 1884, 'ROC': 1890, 'ROK': 1871, 'SBS': 1876, 'SDP': 1969,
+    'SEA': 1977, 'SEN': 1938, 'SEP': 1969, 'SFG': 1958, 'SL2': 1937, 'SL3': 1939, 'SLB': 1902, 'SLG': 1920,
+    'SLI': 1914, 'SLM': 1884, 'SLR': 1875, 'SLS': 1922, 'SNH': 1938, 'SNS': 1940, 'STL': 1875, 'STP': 1884,
+    'SYR': 1879, 'SYS': 1890, 'TBD': 1998, 'TBR': 2008, 'TC' : 1940, 'TC2': 1939, 'TEX': 1972, 'TLM': 1890,
+    'TOL': 1884, 'TOR': 1977, 'TRO': 1871, 'TRT': 1879, 'TT' : 1923, 'WAP': 1932, 'WAS': 1884, 'WEG': 1936,
+    'WES': 1875, 'WHS': 1892, 'WIL': 1884, 'WMP': 1925, 'WNA': 1884, 'WNL': 1886, 'WOR': 1880, 'WP' : 1924,
+    'WSA': 1961, 'WSH': 1901, 'WSN': 2005, 'WST': 1884, 
+}
+
+team_equivalents = [
+    {'ANA', 'CAL', 'LAA'},
+    {'BSN', 'MLN', 'ATL'},
+    {'BLO', 'BLN', 'BLT', 'MLA', 'SLB', 'BAL'},
+    {'BRD', 'BRS', 'BOS'},
+    {'BRO', 'LAD'},
+    {'PHA', 'OAK'},
+    {'FLA', 'MIA'},
+    {'SEP', 'MIL'},
+    {'WSH', 'MIN'},
+    {'MON', 'WSN'},
+    {'NYG', 'SFG'},
+    {'TBD', 'TBR'},
+    {'BCA', 'IAB'},
+    {'AC' , 'BAG'},
+    {'BR2', 'BRG'},
+    {'NEG', 'CEG', 'WEG', 'BEG'},
+    {'CNS', 'CIN'},
+    {'CCB', 'CBE'},
+    {'CLE', 'CLV'},
+    {'CS' , 'CSW'},
+    {'AB2', 'ID' },
+    {'CC' , 'IC' },
+    {'JRC', 'CBR'},
+    {'LVB', 'LOW'},
+    {'BE' , 'NE' },
+    {'PC' , 'TC' , 'TC2'},
+    {'PBK', 'PK' },
+    {'SLG', 'SLS'},
+    # Potential issue here: BR uses the same code, HAR, for both
+    # Hartford Dark Blues (NL 1876-1878)
+    # Harrisburg Stars (NNL 1943)
+    # These are two distinct teams, but with the same code in BR
+    {'AB3', 'SL3', 'SNS', 'HAR'},
+    {'WP' , 'WMP'},
+    {'WHS', 'WNA'},
+    {'WAS', 'WST'}
+]
+
+def get_first_season(team: str, include_equivalents: bool = True) -> Optional[int]:
+    """
+    Return the first season recorded for a team code.
+
+    Args:
+        team: Team code; must be a key of ``first_season_map``.
+        include_equivalents: When True, also consider every code that shares a
+            ``team_equivalents`` group with ``team`` and return the earliest
+            first season among them. When False, return the team's own entry.
+
+    Returns:
+        The earliest known first season, or None when neither the team nor
+        any of its equivalents has one (alias-only codes).
+
+    Raises:
+        KeyError: If ``team`` (or an equivalent code) is missing from
+            ``first_season_map``.
+    """
+    own_first = first_season_map[team]
+    if not include_equivalents:
+        return own_first
+
+    candidates = [own_first]
+    for group in team_equivalents:
+        if team in group:
+            candidates.extend(first_season_map[code] for code in group)
+
+    # Alias-only codes map to None; ignore them rather than substituting the
+    # current year, so callers can tell "unknown" apart from a real season.
+    known = [season for season in candidates if season is not None]
+    return min(known) if known else None
 
 STATCAST_VALID_DATES = {
 	2008: (date(2008, 3, 25), date(2008, 10, 27)),
@@ -288,25 +350,26 @@ def flag_imputed_data(statcast_df: pd.DataFrame) -> pd.DataFrame:
 
 def norm_pitch_code(pitch: str, to_word: bool = False) -> str:
+	"""
+	Normalize a pitch name via pitch_name_to_code_map (looked up by pitch.upper()).
+
+	When to_word is True and the lookup succeeded, the result is mapped again
+	through pitch_code_to_name_map. Raises ValueError when no mapping is found,
+	with a dedicated message when the input is 'all' (case-insensitive).
+	"""
 	normed = pitch_name_to_code_map.get(pitch.upper())
-	normed = pitch_code_to_name_map.get(normed) if to_word else normed
+	normed = pitch_code_to_name_map.get(normed) if to_word and normed else normed
 	if normed is None:
 		if pitch.lower() == 'all':
 			raise ValueError("'All' is not a valid pitch in this particular context!")
 		raise ValueError(f'{pitch} is not a valid pitch!')
 	return normed
 
 def norm_positions(pos: Union[int, str], to_word: bool = False, to_number: bool = True) -> str:
-	pos = str(pos) if type(pos) == int else pos
-	if pos in pos_code_to_numbers_map.values():
+	pos_str = str(pos)
+	normed: Optional[str] = None
+	if pos_str in pos_code_to_numbers_map.values():
 		to_number = False
-		normed = pos
+		normed = pos_str
 	else:
-		normed = pos_name_to_code_map.get(pos.upper())
-		normed = pos_code_to_name_map.get(normed) if to_word else normed
+		normed = pos_name_to_code_map.get(pos_str.upper())
+		normed = pos_code_to_name_map.get(normed) if to_word and normed else normed
 	if to_number:
 		if normed not in ["IF", "OF"]:
-			normed = pos_code_to_numbers_map.get(normed)
-		if pos.lower() == "all":
+			normed = pos_code_to_numbers_map.get(normed) if normed else normed
+		if pos_str.lower() == "all":
 			normed = ""
 	if normed is None:
 		raise ValueError(f'{pos} is not a valid position!')

diff --git a/tests/integration/pybaseball/test_standings.py b/tests/integration/pybaseball/test_standings.py
@@ -19,7 +19,7 @@ def get_division_counts_by_season(season: Optional[int]) -> int:
 
 class TestBRefStandings:
     @pytest.mark.parametrize(
-        "season", [(x) for x in range(1871, most_recent_season())]
+        "season", [(x) for x in range(1876, most_recent_season())]
     )
     def test_standings(self, season: Optional[int]) -> None:
         standings_list = standings(season)
@@ -33,7 +33,7 @@ def test_standings(self, season: Optional[int]) -> None:
             assert len(data.columns) > 0
             assert len(data.index) > 0
 
-    def test_standings_pre_1871(self) -> None:
+    def test_standings_pre_1876(self) -> None:
         season = 1870
 
         with pytest.raises(ValueError):

diff --git a/tests/integration/pybaseball/test_team_results.py b/tests/integration/pybaseball/test_team_results.py
@@ -1,15 +1,29 @@
+from typing import Optional
+
 import pytest
 
 from pybaseball.team_results import get_soup, schedule_and_record
-from pybaseball.utils import first_season_map, most_recent_season
+from pybaseball.utils import first_season_map, get_first_season, most_recent_season
+
 
+missing_schedules_scores = {
+    'AB2', 'AB3', 'ABC', 'AC' , 'AG' , 'BBB', 'BBS', 'BCA', 'BE' , 'BEG', 'BFB', 'BLO', 'BLT', 'BR2', 'BRD', 'BRS',
+    'CAG', 'CBB', 'CBE', 'CBK', 'CBL', 'CBN', 'CBR', 'CC' , 'CCB', 'CCU', 'CEG', 'CEL', 'CG',  'CHH', 'CHI', 'CHP', 'CHT',
+    'CL2', 'CLI', 'CLS', 'CLV', 'CNR', 'CNS', 'COB', 'COG', 'COT', 'CRS', 'CS' , 'CSE', 'CSW', 'CT' , 'CTG', 'CTS', 'CUP',
+    'DM' , 'DS' , 'DTS', 'DW' , 'DYM', 'HBG', 'HG' , 'HIL', 'IA' , 'IAB', 'IBL', 'IC' , 'ID' , 'IHO', 'JRC', 'KCM', 'KCU',
+    'LGR', 'LOW', 'LRG', 'LVB', 'MB' , 'MGS', 'MLA', 'MLU', 'MRM', 'MRS', 'NBY', 'ND' , 'NE' , 'NEG', 'NLG', 'NS' , 'NWB',
+    'NYC', 'PBG', 'PBK', 'PC' , 'PK' , 'PS' , 'PTG', 'SBS', 'SEN', 'SL2', 'SL3', 'SLG', 'SLI', 'SLS', 'SNH', 'SNS',
+    'SYS', 'TC' , 'TC2', 'TLM', 'TRT', 'TT' , 'WAP', 'WEG', 'WMP', 'WNA', 'WNL', 'WP' , 'WST'
+}
 
 @pytest.mark.parametrize(
     "season, team", [
-        (first_season_map[x], x) for x in first_season_map.keys()
+        (get_first_season(x, False), x) for x in first_season_map.keys()
     ]
 )
-def test_schedule_and_record(season: int, team: str) -> None:
+def test_schedule_and_record(season: Optional[int], team: str) -> None:
+    if season is None or team in missing_schedules_scores:
+        return
     result = schedule_and_record(season, team)
 
     assert result is not None
@@ -28,12 +42,15 @@ def test_schedule_and_record(season: int, team: str) -> None:
 
 @pytest.mark.parametrize(
     "season, team", [
-        (first_season_map[x] - 1, x) for x in first_season_map.keys()
+        (get_first_season(x, False), x) for x in first_season_map.keys()
     ]
 )
-def test_schedule_and_record_bad_years(season: int, team: str) -> None:
+def test_schedule_and_record_bad_years(season: Optional[int], team: str) -> None:
+    # Alias-only team codes have no first season of their own (None in
+    # first_season_map), so there is no "year before" to query for them.
+    if season is None:
+        return
+
+    # Query the season immediately before the team's own first season and
+    # expect a ValueError.
     with pytest.raises(ValueError):
-        schedule_and_record(season, team)
+        schedule_and_record(season - 1, team)
 
 def test_schedule_and_record_after_existence() -> None:
     with pytest.raises(ValueError):