-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrappy.py
180 lines (133 loc) · 6.24 KB
/
scrappy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import requests as rq
import bs4 as bs4
import random
import prettytable as pt
class Product:
    """
    Provides functions for creating product specific search urls.

    The product name is URL-encoded once, at construction time, and the
    encoded form is stored in ``product_name`` so every url builder can
    interpolate it directly into a query string.
    """

    def __init__(self, product_name):
        """
        Stores the url-encoded form of the given product name.

        product_name: the search term; any value is accepted and converted
        with ``str`` before encoding.
        """
        # Imported here so this class does not rely on a module-level import.
        from urllib.parse import quote_plus

        # quote_plus turns spaces into "+" (as the plain replace did before)
        # and additionally percent-escapes characters such as "&", "+", "#"
        # and "=" that would otherwise change the meaning of the query string.
        self.product_name = quote_plus(str(product_name))

    def amazon_url(self):
        """
        Creates amazon url using product name and base url and returns it.
        """
        return f"https://www.amazon.in/s?k={self.product_name}"

    def flipkart_url(self):
        """
        Creates flipkart url using product name and base url and returns it.
        """
        return f"https://www.flipkart.com/search?q={self.product_name}&marketplace=FLIPKART"

    def product_urls(self):
        """
        Returns a dictionary of product urls keyed by website name
        ("amazon" and "flipkart").
        """
        return {"amazon": self.amazon_url(), "flipkart": self.flipkart_url()}
class Request:
    """
    Provides functions for making request and processing response into useful fields.

    Constructing an instance immediately fetches every product url and
    parses the responses; the parsed documents are kept in ``self.htmls``
    keyed by website name.
    """

    # Seconds to wait for a website before giving up. Without a timeout
    # requests.get may block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 10
    # Amazon product names longer than this are cut and suffixed with "...".
    MAX_NAME_LENGTH = 50
    # Shown in place of a price whose tag held no single text value.
    PRICE_PLACEHOLDER = "PNA"

    def __init__(self, Product):
        """
        Builds the header pool, reads the urls from the product and fetches them.

        Product: an object exposing ``product_urls()`` that returns a
        dictionary of website name -> url.
        """
        accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
        user_agents = (
            'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
        )
        # One header set per user agent; make_request picks one at random.
        self.custom_headers_list = [
            {'User-Agent': user_agent, "accept": accept}
            for user_agent in user_agents
        ]
        self.urls = Product.product_urls()
        self.htmls = self.get_htmls()

    def make_request(self):
        """
        Sends a get request to every product url and returns a dictionary
        with two entries: "status" (website -> status code) and
        "response" (website -> raw response content).

        Propagates whatever exception the request raises, including a
        timeout after REQUEST_TIMEOUT seconds.
        """
        stat_codes = {}
        resp = {}
        for site, url in self.urls.items():
            # random.choice follows the list length, so header sets can be
            # added or removed without touching this line.
            headers = random.choice(self.custom_headers_list)
            req = rq.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            resp[site] = req.content
            stat_codes[site] = req.status_code
        return {"status": stat_codes, "response": resp}

    def get_htmls(self):
        """
        Parses the content of every response with the lxml parser, stores
        the result in ``self.htmls`` (website -> parsed document) and
        returns it.
        """
        response = self.make_request()["response"]
        self.htmls = {
            site: bs4.BeautifulSoup(content, "lxml")
            for site, content in response.items()
        }
        return self.htmls

    def clean_html_tags(self, obj):
        """
        Replaces every tag in the list ``obj`` with its ``.string`` value.

        The list is modified in place and also returned for convenience.
        An entry becomes None when the tag's ``.string`` is None.
        """
        for index, tag in enumerate(obj):
            obj[index] = tag.string
        return obj

    def _shorten(self, text):
        """Cuts text longer than MAX_NAME_LENGTH and appends "..."; None passes through."""
        # The None check matters: calling len() on a missing name would raise.
        if text is not None and len(text) > self.MAX_NAME_LENGTH:
            return text[:self.MAX_NAME_LENGTH] + "..."
        return text

    def get_names(self):
        """
        Extracts the product names from the parsed pages using site specific
        selectors and returns them as a dictionary of website -> list of names.

        Amazon names are shortened to MAX_NAME_LENGTH characters plus "...".
        A name is None when its tag had no single text value.
        """
        names = {}
        for site, soup in self.htmls.items():
            if site == 'amazon':
                found = soup.select('div.puisg-col-inner span.a-size-medium.a-color-base.a-text-normal')
                found.extend(soup.select('div.puisg-col-inner span.a-size-base-plus.a-color-base.a-text-normal'))
                # strips extended lines.
                names[site] = [self._shorten(text) for text in self.clean_html_tags(found)]
            elif site == 'flipkart':
                found = soup.find_all('div', {'class': '_4rR01T'})
                found.extend(soup.find_all('a', {'class': 's1Q9rs'}))
                found.extend(soup.find_all('a', {'class': 'IRpwTa'}))
                names[site] = self.clean_html_tags(found)
        return names

    def get_prices(self):
        """
        Extracts the product prices from the parsed pages using site specific
        selectors and returns them as a dictionary of website -> list of prices.

        Amazon prices get a "₹" prefix; Flipkart price text is kept as found.
        On both sites a missing price is reported as PRICE_PLACEHOLDER.
        """
        prices = {}
        for site, soup in self.htmls.items():
            if site == 'amazon':
                found = soup.select('div.puisg-col-inner span.a-price-whole')
                prices[site] = [
                    self.PRICE_PLACEHOLDER if text is None else "₹" + str(text)
                    for text in self.clean_html_tags(found)
                ]
            elif site == 'flipkart':
                found = soup.find_all('div', {'class': '_30jeq3'})
                prices[site] = [
                    self.PRICE_PLACEHOLDER if text is None else text
                    for text in self.clean_html_tags(found)
                ]
        return prices
class Presentation:
    """
    Contains functions to Present the response data in a clean and meaningful table.
    """

    @staticmethod
    def print_table(product_website, product=None):
        """
        Fetches the product details for one website and prints them as a table.

        product_website: key of the website to show, e.g. "amazon" or "flipkart".
        product: the product to search for; when omitted, the module-level
        ``p1`` is used, which keeps existing one-argument calls working.

        Raises KeyError if no names or prices were extracted for
        ``product_website``.
        """
        if product is None:
            product = p1
        # A single Request is shared by both lookups: constructing a Request
        # fetches the pages, so one instance avoids a second round of
        # downloads and keeps names and prices from the same responses.
        request = Request(product)
        names = request.get_names()[product_website]
        prices = request.get_prices()[product_website]
        table = pt.PrettyTable(align='l')
        table.field_names = ["S.NO", f" {product_website} Product Name", "Price (INR)"]
        # zip stops at the shorter list, so unmatched names/prices are dropped.
        for no, (name, price) in enumerate(zip(names, prices), start=1):
            table.add_row([no, name, price])
        print(table)
# Driver Code.
if __name__ == "__main__":
    # Ask once for the search term; the same product is looked up on every site.
    product_name = input("Enter Product name to search for: ")
    # Presentation.print_table reads this module-level name, so it must stay p1.
    p1 = Product(product_name)
    for website in ("flipkart", "amazon"):
        Presentation.print_table(website)