-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path92_97(scraping).py
101 lines (60 loc) · 2.05 KB
/
92_97(scraping).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import urllib.request as req  # urlopen returns a file-like response object
import urllib.parse  # explicit import: urlencode lives in urllib.parse, not the bare package

# Plain GET request; the response exposes .code, .read(), headers, etc.
response = req.urlopen("https://stackoverflow.com/documentation/")
#print(response.code)
#print(response.read())

# POST request: form fields must be URL-encoded, then byte-encoded, before sending.
# Bug fix: the form field was misspelled "usename" — corrected to "username".
query_param = {"username": "stackoverflow", "password": "me.em"}
query_encode = urllib.parse.urlencode(query_param).encode('utf8')
# Supplying a data argument turns urlopen into a POST request.
response_param = req.urlopen("https://stackoverflow.com/users/login", query_encode)
print(response_param.code)
# 92.5 BeautifulSoup
from bs4 import BeautifulSoup
import requests

"""
# Use a bage ready by obtaining it by request module
res = requests.get("https://www.codechef.com/problems/easy")
page = BeautifulSoup(res.text , 'lxml')
tags_tbl = page.select('table.dataTable') ## Select table tage with class name dataTable
print("tags_tbl",len(tags_tbl) )
"""

# Small in-memory HTML fragment used to demonstrate CSS-selector lookups.
data = """
<ul>
<li class ="item"> l1 </li>
<li class ="item"> l2 </li>
<li class ="item"> l3 </li>
</ul>
"""
# Parse with the stdlib html.parser backend, then print the text content
# of every <li> element carrying the "item" class.
soup = BeautifulSoup(data, 'html.parser')
for item in soup.select("li.item"):
    print(item.get_text())
# Demonstrates find_all: grab the text node that follows each <label>,
# and the href attribute of every anchor that actually has one.
html = '''<a href="some_url">next</a>
<span class="class"><a href="another_url">later</a></span>
<label> Name :</label>
Dana
'''
soup = BeautifulSoup(html, 'html.parser')
# next_sibling of the <label> tag is the raw text node after it ("Dana").
for label_tag in soup.find_all('label'):
    print("Found the URL:", label_tag.next_sibling.strip())
# href=True filters to anchors that define an href attribute.
for anchor in soup.find_all('a', href=True):
    print("Found the URL:", anchor['href'])
import requests

# A Session object persists cookies (and reuses connections) across requests,
# so a cookie set by one call is visible to the next.
with requests.Session() as session:
    # First request stores a cookie on the session via httpbin's helper endpoint.
    session.get('http://httpbin.org/cookies/set?key=value')
    # Second request echoes back whatever cookies the session sent.
    response = session.get('http://httpbin.org/cookies')
    print(response.text)

from requests import post

# One-off POST with form-encoded data; print the response headers.
foo = post('http://httpbin.org/post', data={'key': 'value'})
print(foo.headers)
"""
Selenium works as we pretend like an actuall user,
It simulates a real user as some websites don't like to be scraped
browser = webdriver.Firefox()
browser.get('http://stackoverflow.com/questions?sort=votes') ##Load document
title = browser.find_element_by_css_selector('h1')
print(title)
"""
"""
Beautiful Soup automatically converts incoming documents to Unicode and outgoing documents to UTF-8.
"""