-
Notifications
You must be signed in to change notification settings - Fork 0
/
AppMetadata.py
178 lines (155 loc) · 6.46 KB
/
AppMetadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import asyncio
import re
from asyncio import Task
from typing import Union, TypedDict, Set, List
from urllib import request
import bs4
from aiohttp import ClientSession
from aiohttp_socks import ProxyConnector
from bs4 import BeautifulSoup
from src import constants
from src.utils.proxy import ALL_PROXY
from src.utils.url_util import clean_store_url
class AppMetadata:
    """Metadata scraped from an App Store product page.

    Holds the store URL/region plus the parsed icon base URL, available
    image types/resolutions, app name and version.  Construction either
    parses a caller-supplied HTML page immediately, or schedules an asyncio
    task to fetch the page; in the latter case callers must
    ``await await_html_task()`` before reading metadata.
    """

    # _DEFAULT_TYPE_PRIORITY = ['image/webp', 'image/png', 'image/jpeg']
    _RETRY_HTML = 3  # fetch attempts before giving up on the page

    class Metadata(TypedDict):
        store_url: str
        store_region: str
        icon_base_url: str      # icon URL up to (not including) the '/<res>x0w.<ext>' suffix
        original_type: str      # extension of the originally referenced image, e.g. 'png'
        types: Set[str]         # available formats, e.g. {'webp', 'png'}
        resolutions: List[int]  # sorted ascending, parsed from srcset widths ('123w')
        app_name: str
        app_version: str

    def __init__(self, app_store_url: str, html_page: Union[str, None] = None):
        """
        :param app_store_url: App Store product page URL (cleaned via clean_store_url)
        :param html_page: optional pre-fetched page HTML; when omitted, an
            asyncio task is created (requires a running event loop)
        """
        self._html_task: Union[Task, None] = None
        self._soup: Union[BeautifulSoup, None] = None
        store_url, store_region = clean_store_url(app_store_url)
        # Assign (not just annotate) these: _parse_metadata_icon's error
        # path reads self.store_url.
        self.store_url: str = store_url
        self.store_region: str = store_region
        self._metadata: AppMetadata.Metadata = {
            'store_url': store_url,
            'store_region': store_region,
            'icon_base_url': '',
            'original_type': '',
            'types': set(),
            'resolutions': [],
            'app_name': '',
            'app_version': '',
        }
        if not html_page:
            # create async task to get web page html
            self._html_task = asyncio.create_task(self._get_page_html())
        else:
            # parse icon metadata in-place
            self._soup = BeautifulSoup(html_page, 'html.parser')
            self._parse_metadata()

    async def _get_page_html(self) -> None:
        """
        Get web page html and parse metadata from it.
        This is optionally called by __init__.

        Retries up to _RETRY_HTML times on unexpected status codes and
        raises if every attempt fails.  A 404 returns silently, leaving all
        metadata at its default value.
        """
        last_status: Union[int, None] = None
        for _ in range(AppMetadata._RETRY_HTML):
            async with ClientSession(
                connector=ProxyConnector.from_url(ALL_PROXY) if ALL_PROXY else None
            ) as session:
                async with session.get(self._metadata['store_url']) as resp:
                    if resp.status == 200:
                        self._soup = BeautifulSoup(await resp.text(), 'html.parser')
                        self._parse_metadata()
                        return
                    elif resp.status == 404:
                        # Note: in this case all metadata will be the default value
                        print(f"{self._metadata['store_url']} 404")
                        return
                    # else retry; remember the status so the final error
                    # does not depend on `resp` surviving outside its scope
                    last_status = resp.status
        raise Exception(f"{self._metadata['store_url']} html bad status code: {last_status}")

    async def await_html_task(self):
        """Wait for the background HTML fetch started by __init__, if any."""
        if self._html_task:
            await self._html_task

    def _parse_metadata(self):
        """
        Populate all data from html_response
        """
        self._parse_metadata_icon()
        self._parse_metadata_app_name_version()

    def _parse_metadata_icon(self) -> Metadata:
        """
        Parse icon metadata from html_response
        Get icon_base_url, types('webp', 'png', 'jpeg'), and resolutions(123w)
        :return: the updated metadata dict
        :raises AttributeError: when the icon <picture> tag is not found
        """
        tag_picture = self._soup.find('picture', attrs={
            'id': re.compile(r'ember\d+'),
            'class': 'we-artwork',
        })  # The very first result is the app icon itself
        try:  # TODO possibly html document is not fetched
            img_sources = [x for x in tag_picture.contents
                           if isinstance(x, bs4.Tag) and x.name == 'source']
        except AttributeError as e:
            print(f"{self.store_url} icon not found")
            raise e
        # Flatten every srcset entry into [url, '<res>w'] pairs
        img_sources_flatten = sum([[x.split(' ') for x in each_source.attrs.get('srcset').split(', ')]
                                   for each_source in img_sources], [])
        one_url = img_sources_flatten[0][0]
        # final return
        types = {each_source.attrs.get('type')[6:] for each_source in img_sources}  # strip 'image/' prefix
        resolutions = sorted({int(r[:-1]) for (_, r) in img_sources_flatten})       # strip trailing 'w'
        # Everything before the last '/' is the shared base URL
        img_base_url = one_url[:len(one_url) - one_url[::-1].index('/') - 1]
        # Extension after the last '.' of the base URL
        original_type = img_base_url[len(img_base_url) - img_base_url[::-1].index('.'):]
        self._metadata.update({
            'icon_base_url': img_base_url,
            'original_type': original_type,
            'types': types,
            'resolutions': resolutions,
        })
        return self._metadata

    def _parse_metadata_app_name_version(self) -> Metadata:
        """
        Parse the app name and latest version from html_response.
        :return: the updated metadata dict
        """
        title = self._soup.find(
            'h1', attrs={'class': re.compile(r'(product|app)-header__title')}
        ).next_element.get_text().strip()
        version = self._soup.find(
            'p', attrs={'class': 'whats-new__latest__version'}
        )  # If not found will be None
        # example: https://apps.apple.com/cn/app/id1590820002
        if version:
            version = version.next_element.get_text().strip().split(' ')[-1]
        else:
            version = 'unknown'
        self._metadata.update({
            'app_name': title,
            'app_version': version,
        })
        return self._metadata

    def get_metadata(self):
        """Return a shallow copy of the metadata dict."""
        return self._metadata.copy()

    def get_url(self, type_: str = None, resolution: Union[int, str] = None) -> str:
        """
        Get icon url
        :param type_: 'webp', 'png', 'jpeg'; defaults to the original type
        :param resolution: any resolution in metadata or 'max'; defaults to
            the largest parsed resolution
        :return: direct url to the image at designated resolution OR **empty string** if page 404
        """
        if not self._metadata['icon_base_url']:
            return ''  # fail case
        if not type_:
            type_ = self._metadata['original_type']
        if not resolution and self._metadata['resolutions']:
            resolution = self._metadata['resolutions'][-1]
        elif resolution == 'max':
            # fixed: module is imported lowercase ('from src import constants')
            resolution = constants.IMAGE_SIZE_CEIL
        else:
            # accept both int and numeric-string resolutions
            assert str(resolution).isdigit()
        return f"{self._metadata['icon_base_url']}/{resolution}x0w.{type_}"

    def get_bin(self, type_: str = None, resolution: Union[int, str] = None):
        """Synchronously download and return the icon bytes."""
        icon_url = self.get_url(type_, resolution)
        with request.urlopen(icon_url) as resp:
            return resp.read()

    async def get_bin_async(self, type_: str = None, resolution: Union[int, str] = None):
        """Asynchronously download and return the icon bytes (status must be 200)."""
        icon_url = self.get_url(type_, resolution)
        async with ClientSession(
            connector=ProxyConnector.from_url(ALL_PROXY) if ALL_PROXY else None
        ) as session:
            async with session.get(icon_url) as resp:
                assert resp.status == 200
                return await resp.read()