-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfars_category.py
120 lines (97 loc) · 3.6 KB
/
fars_category.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import sqlite3
import requests
import json
import scripts.database as db
from random import randint
from time import sleep
from tqdm import tqdm
from datetime import datetime
from bs4 import BeautifulSoup
base_url = 'https://www.farsnews.ir'
db_file_path = './volume/fars_news.db'
db_connection = sqlite3.connect(db_file_path)
db_cursor = db_connection.cursor()
# Import Categories
with open('./statics/fars_news_categories.json', encoding="utf-8") as file:
categories = json.load(file)
try:
USER_TABLE_QUERY = '''CREATE TABLE IF NOT EXISTS news (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title CHAR(150),
short_link CHAR(100),
service CHAR(50),
subgroup CHAR(50),
abstract TEXT,
body TEXT,
published_datetime TIMESTAMP
);'''
db_cursor.executescript(USER_TABLE_QUERY)
except sqlite3.Error as error:
print(error)
db_connection.commit()
db_connection.close()
def extract_single_news_information(link, category):
try:
page = requests.get(link)
soup = BeautifulSoup( page.content, 'html.parser' )
# Get Published Datetime
published_date_object = soup.select('div.header time')
if (published_date_object):
published_date = datetime.strptime(published_date_object[0]['datetime'], '%m/%d/%Y %H:%M:%S %p')
else:
published_date = datetime.now().strftime('%m/%d/%Y %H:%M:%S %p')
# Get Title
title_object = soup.select('h1.title')
title = title_object[0].get_text().strip()
# Get Abstract
abstract_object = soup.select('h1.title + p')
abstract = abstract_object[0].get_text().strip()
# Get Service name and Subgroup Name
service_object = soup.select('div.header h2.category-name a')
if (len(list(service_object)) == 2):
service = service_object[0].get_text().strip()
subgroup = service_object[1].get_text().strip()
else:
service = categories[category]
subgroup = service_object[0].get_text().strip()
# Get ShortLink
short_link_object = soup.select('div.short-url')
short_link = short_link_object[0].get_text().strip()
#Get Tags
tags_list_object = soup.select('div.tags a')
tags_list = []
for tag in tags_list_object:
tags_list.append(tag.get_text().strip())
#Get Body
body_objects = soup.select('div[itemprop="articleBody"] > p')
paragraphs = []
for pr in body_objects:
paragraphs.append(pr.get_text().strip())
body = ' '.join(paragraphs)
# Insert into the DB
db.insert_news_in_database(
db_file=db_file_path,
title=title,
short_link=short_link,
service=service,
subgroup=subgroup,
abstract=abstract,
body=body,
published_datetime=published_date,
)
except:
print(link)
def main():
category = 'university'
total = 400
start_from = 0
for p in tqdm(range(start_from, start_from + total)):
delay = randint(1,5)
page = requests.get(f'https://www.farsnews.ir/{category}?p={p}')
soup = BeautifulSoup( page.content, 'html.parser' )
news_list = soup.select('div.last-news ul.news-list li > a')
for link in news_list:
extract_single_news_information(base_url + link['href'], category)
sleep(delay)
if __name__ == '__main__':
main()