Scraping.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Class for scraping images and PDF documents from a target site
import os
import re
import urlparse
import requests
from lxml import html
from PDFMetaData import PDFMetaData
from ImageMetaData import ImageMetaData
try:
    from bs4 import BeautifulSoup
except ImportError:
    print("BeautifulSoup is not installed: pip install beautifulsoup4")
    exit(1)


class Scraping:

    def scrapingBeautifulSoup(self, hostname):
        try:
            print("BeautifulSoup..............")
            # Normalize the target so it always carries a scheme
            if not hostname.startswith("http"):
                url = "http://" + hostname
            else:
                url = hostname
            response = requests.get(url, stream=True)
            bs = BeautifulSoup(response.text, 'lxml')
            # Create the directory where the images will be stored
            if not os.path.exists("images"):
                os.makedirs("images")
            for tagImage in bs.find_all("img"):
                src = tagImage.get('src')
                if src is None:
                    continue
                # Resolve relative links against the base url
                if not src.startswith("http"):
                    download = urlparse.urljoin(url, src)
                else:
                    download = src
                print(download)
                # Download every image into the images directory
                r = requests.get(download)
                f = open('images/%s' % download.split('/')[-1], 'wb')
                f.write(r.content)
                f.close()
        except Exception as e:
            print(e)
            print("Error connecting to " + hostname + " for scraping the site")
        print("\nGet Metadata Image")
        print("------------------------------------")
        imageMetaData = ImageMetaData()
        imageMetaData.printMetaData()

    def scrapingImagesPdf(self, ip):
        print("\nScraping the server for images and pdfs.... " + ip)
        try:
            url = 'http://' + ip
            print(url)
            response = requests.get(url)
            parsed_body = html.fromstring(response.text)
            # Grab the links to all images
            images = parsed_body.xpath('//img/@src')
            print('Found %s images' % len(images))
            # Create the directory where the images will be stored
            if not os.path.exists("images"):
                os.makedirs("images")
            for image in images:
                # Resolve relative links against the base url
                if not image.startswith("http"):
                    download = urlparse.urljoin(url, image)
                else:
                    download = image
                print(download)
                # Download every image into the images directory
                r = requests.get(download, stream=True)
                f = open('images/%s' % download.split('/')[-1], 'wb')
                f.write(r.content)
                f.close()
            # Grab the links to all pdf documents
            pdfs = parsed_body.xpath('//a[@href[contains(., ".pdf")]]/@href')
            if len(pdfs) > 0:
                # Create the directory where the pdfs will be stored
                if not os.path.exists("pdfs"):
                    os.makedirs("pdfs")
                print('Found %s pdf' % len(pdfs))
                for pdf in pdfs:
                    # Resolve relative links against the base url
                    if not pdf.startswith("http"):
                        download = urlparse.urljoin(url, pdf)
                    else:
                        download = pdf
                    print(download)
                    # Download every pdf into the pdfs directory
                    r = requests.get(download, stream=True)
                    f = open('pdfs/%s' % download.split('/')[-1], 'wb')
                    f.write(r.content)
                    f.close()
        except Exception as e:
            print(e)
            print("Error connecting to " + ip + " for scraping the site")
        print("\nGet Metadata Image")
        print("------------------------------------")
        imageMetaData = ImageMetaData()
        imageMetaData.printMetaData()
        print("\nGet Metadata PDF")
        print("------------------------------------")
        pdfMetaData = PDFMetaData()
        pdfMetaData.printMetaData()

    def getImgFromUrl(self, urlSource, extension):
        """
        name: getImgFromUrl
        brief: Get image links from a url using a regular expression.
        param urlSource: Url from which the image links are extracted.
        param extension: Extension to add to the regular expression.
        return: All the links that match the regular expression.
        """
        # Check that the url starts with http
        if not urlSource.startswith("http"):
            urlSource = "http://" + urlSource
        # Get the HTML of the page
        response = requests.get(urlSource, stream=True)
        htmlContent = response.text
        # Compile the regular expression for <img> tags with the given extension
        expression = r'<img src="([^"]+)\.' + re.escape(extension) + r'"'
        regexp = re.compile(expression, re.I | re.MULTILINE | re.DOTALL)
        # Build the list of all links that match the regular expression
        links = [link + '.' + extension for link in regexp.findall(htmlContent)]
        for link in links:
            print(urlSource + link)
        return links
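

# --- Usage sketch (illustrative only, not part of the original module) ---
# A minimal example of how the Scraping class above might be driven from a
# script. The target host "scanme.nmap.org" and the "png" extension are
# assumptions chosen for demonstration; PDFMetaData and ImageMetaData must be
# importable from this project for the metadata steps to run.
if __name__ == "__main__":
    scraper = Scraping()
    # Download the images and pdfs linked from the front page and print their metadata
    scraper.scrapingImagesPdf("scanme.nmap.org")
    # Collect png image links with the regular-expression based helper
    links = scraper.getImgFromUrl("scanme.nmap.org", "png")
    print("Found %s png links" % len(links))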