# transfermarkt-scraper.py
"""
Acquire raw data from transfermark website using transfermark-scraper.
usage: 1_acquire.py [-h] {local,cloud} ...
positional arguments:
{local,cloud}
local Run the acquiring step locally
cloud Run the acquiring step in the cloud
optional arguments:
-h, --help show this help message and exit
"""
import os
import pathlib
import argparse
import sys
import logging
import logging.config
from typing import List

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings

from transfermarkt_datasets.core.utils import (
    read_config,
    submit_batch_job_and_wait,
    seasons_list
)

acquire_config = read_config()["acquire"]
logging.config.dictConfig(
    acquire_config["logging"]
)
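
# For illustration only: the "logging" section of config.yml is expected to be a standard
# logging.config.dictConfig payload. The values below are hypothetical, not the project's
# actual configuration:
#   logging:
#     version: 1
#     handlers:
#       console:
#         class: logging.StreamHandler
#     root:
#       level: INFO
#       handlers: [console]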

def scrapy_config() -> Settings:
    """Instantiate scrapy settings for the acquiring run.

    Returns:
        Settings: The default scrapy settings instance + overrides in config.yml
    """
    # https://github.com/scrapy/scrapy/blob/master/scrapy/utils/project.py#L61
    default_settings = get_project_settings()
    overrides = acquire_config["scrapy_config"]
    default_settings.setdict(overrides)

    return default_settings
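
# For illustration only: "scrapy_config" in config.yml holds plain key/value overrides that
# setdict() above overlays on the project defaults. With hypothetical values such as
#   scrapy_config:
#     LOG_LEVEL: INFO
#     USER_AGENT: transfermarkt-datasets
# scrapy_config().get("LOG_LEVEL") would return "INFO".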

class Asset():
    """A wrapper for the asset to be acquired.

    It contains some useful methods for manipulating the assets, such as path and parent rendering.
    """

    asset_parents = {
        'competitions': None,
        'games': 'competitions',
        'clubs': 'competitions',
        'players': 'clubs',
        'appearances': 'players',
        'game_lineups': 'games'
    }

    def __init__(self, name) -> None:
        self.name = name
        self.parent = None

    def set_parent(self):
        """Get the parent of this asset as a new Asset."""
        self.parent = Asset(self.asset_parents[self.name])

    def file_path(self, season):
        if self.name == 'competitions':
            return pathlib.Path("data/competitions.json")
        else:
            return pathlib.Path(f"data/raw/transfermarkt-scraper/{season}/{self.name}.json.gz")

    def file_full_path(self, season):
        return str(self.file_path(season).absolute())

    @classmethod
    def all(cls):
        """Get an ordered list of assets to be acquired.

        Asset acquisitions have dependencies on each other. This list returns the right order for the
        asset acquisition steps to run.
        """
        assets = [Asset(name) for name in cls.asset_parents if name != 'competitions']
        for asset in assets:
            asset.set_parent()
        return assets
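
# Illustrative usage of Asset (the values shown follow from asset_parents above):
#   assets = Asset.all()
#   [a.name for a in assets]    # -> ['games', 'clubs', 'players', 'appearances', 'game_lineups']
#   assets[0].parent.name       # -> 'competitions'
#   assets[0].file_path(2024)   # -> Path("data/raw/transfermarkt-scraper/2024/games.json.gz")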

def acquire_on_local(asset, seasons):

    def assets_list(asset: str) -> List[Asset]:
        """Generate the ordered list of Assets to be scraped based on the provided string.

        Args:
            asset (str): A string representing the asset(s) to be scraped.

        Returns:
            List[Asset]: The ordered list of assets to be scraped.
        """
        if asset == 'all':
            assets = Asset.all()
        else:
            asset_obj = Asset(name=asset)
            asset_obj.set_parent()
            assets = [asset_obj]

        return assets

    def issue_crawlers_and_wait(assets, seasons, settings):
        """Create and submit scrapy crawlers to the reactor, and block until they've completed.

        Args:
            assets (List[Asset]): List of assets to be scraped.
            seasons (List[int]): List of seasons to be scraped.
            settings (dict): Crawler settings.
        """
        # https://docs.scrapy.org/en/latest/topics/practices.html#running-multiple-spiders-in-the-same-process
        runner = CrawlerRunner(settings)

        # https://twistedmatrix.com/documents/13.2.0/api/twisted.internet.defer.inlineCallbacks.html
        @defer.inlineCallbacks
        def crawl():
            for season in seasons:
                # if there's no path created yet for this season, create one
                season_path = pathlib.Path(f"data/raw/transfermarkt-scraper/{season}")
                if not season_path.exists():
                    season_path.mkdir(parents=True)

                for asset_obj in assets:
                    # TODO: ideally, let transfermarkt-scraper handle destination file truncation via a setting instead of doing it here
                    # check out https://foroayuda.es/scrapy-sobrescribe-los-archivos-json-en-lugar-de-agregar-el-archivo/
                    file_path = asset_obj.file_path(season)
                    if file_path.exists():
                        os.remove(str(file_path))

                    logging.info(
                        f"Schedule {asset_obj.name} for season {season}"
                    )
                    yield runner.crawl(
                        asset_obj.name,
                        parents=asset_obj.parent.file_full_path(season),
                        season=season
                    )

            reactor.stop()

        crawl()
        reactor.run()
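
    # Sketch of the resulting schedule (assuming, for illustration, seasons [2023, 2024] and
    # --asset all): each `yield runner.crawl(...)` above waits for the previous crawl to finish,
    # so crawls run sequentially, season by season and asset by asset, roughly:
    #   2023/games -> 2023/clubs -> 2023/players -> 2023/appearances -> 2023/game_lineups
    #   -> 2024/games -> 2024/clubs -> ...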

    # get seasons and assets list
    expanded_seasons = seasons_list(seasons)
    expanded_assets = assets_list(asset)

    # define crawler settings
    settings = scrapy_config()

    # create crawlers and wait until they complete
    issue_crawlers_and_wait(expanded_assets, expanded_seasons, settings)

parser = argparse.ArgumentParser()

parser.add_argument(
    '--asset',
    help="Name of the asset to be acquired",
    choices=['clubs', 'players', 'games', 'game_lineups', 'appearances', 'all'],
    required=True
)
parser.add_argument(
    '--seasons',
    help="Season to be acquired. This is passed to the scraper as the SEASON argument",
    default="2024",
    type=str
)

arguments = parser.parse_args()

acquire_on_local(**vars(arguments))
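
# Example invocations (assumed; the exact script name and accepted season formats depend on the
# repository layout and on seasons_list()):
#   python transfermarkt-scraper.py --asset all --seasons 2024
#   python transfermarkt-scraper.py --asset players --seasons 2023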