-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_crawler.py
55 lines (47 loc) · 1.6 KB
/
web_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
import html2text
def get_data_from_website(url, *, timeout=None):
    """
    Retrieve text content and metadata from a given URL.

    Args:
        url (str): The URL to fetch content from.
        timeout (float | tuple | None): Passed straight through to
            ``requests.get``. The default ``None`` keeps the previous
            behaviour of waiting for the server without a time limit.

    Returns:
        tuple: A tuple containing the text content (str, the page rendered
        to Markdown by html2text) and metadata (dict with the keys
        ``title``, ``url``, ``description`` and ``keywords``).
        ``None`` is returned instead when the server answers with HTTP 500.
    """
    # Get response from the server
    response = requests.get(url, timeout=timeout)
    if response.status_code == 500:
        print("Server error")
        return None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Removing js and css code so it does not leak into the extracted text
    for tag in soup(["script", "style"]):
        tag.extract()

    # Extract text in markdown format
    converter = html2text.HTML2Text()
    converter.images_to_alt = True
    converter.body_width = 0
    converter.single_line_break = True
    text = converter.handle(str(soup))

    try:
        page_title = soup.title.string.strip()
    except AttributeError:
        # Either there is no <title> tag or it has no single string child.
        # Fall back to the URL path ("/a/b" -> "a-b"). `url` is a plain str,
        # so it has to be parsed before its path can be read.
        page_title = urlparse(url).path[1:].replace("/", "-")

    meta_description = soup.find("meta", attrs={"name": "description"})
    meta_keywords = soup.find("meta", attrs={"name": "keywords"})

    if meta_description:
        description = meta_description.get("content")
    else:
        description = page_title

    if meta_keywords:
        # Read the content of the keywords tag itself, not the description's.
        keywords = meta_keywords.get("content")
    else:
        keywords = ""

    metadata = {
        'title': page_title,
        'url': url,
        'description': description,
        'keywords': keywords,
    }
    return text, metadata