Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

issues_feature_post_api_576 实现通过POST方式将数据推送到自定义接口 #577

Merged
merged 2 commits into from
Apr 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,3 +239,15 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为
- **publish_tool**:存储微博的发布工具。

</details>

## 设置API接口POST联动(可选)

本部分是可选部分,如果不需要将爬取信息通过POST请求发送到指定API接口,可跳过这一步

请求数据格式为 `content-type : application/json`,接口响应返回也需要是 `content-type : application/json`,HTTP状态码为 `200`

数据主体与 `write_mode` 配置的 `json` 输出格式一致,是整页获取数据json,每页POST发送一次

`api_url` 为指定的API接口地址

`api_token` 为接口鉴权TOKEN,将在 Request Headers 中添加 `api-token` 字段,根据需要配置
4 changes: 4 additions & 0 deletions weibo_spider/config_sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,9 @@
"connection_string": "mongodb://admin:password@localhost:27017/weibo",
"dba_name": "",
"dba_password": ""
},
"post_config": {
"api_url": "",
"api_token": ""
}
}
4 changes: 2 additions & 2 deletions weibo_spider/config_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,14 @@ def validate_config(config):
sys.exit()

# 验证write_mode
write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka']
write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka','post']
if not isinstance(config['write_mode'], list):
logger.warning(u'write_mode值应为list类型')
sys.exit()
for mode in config['write_mode']:
if mode not in write_mode:
logger.warning(
u'%s为无效模式,请从txt、csv、json、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode',
u'%s为无效模式,请从txt、csv、json、post、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode',
mode)
sys.exit()

Expand Down
2 changes: 1 addition & 1 deletion weibo_spider/parser/comment_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_long_weibo(self):
# 3. 去掉所有 HTML 标签,但保留标签内的有效文本
new_content = fromstring(html_string).text_content()
# 4. 替换多个连续的 \n 为一个 \n
new_content = re.sub(r'\n+', '\n', new_content)
new_content = re.sub(r'\n+\s*', '\n', new_content)
weibo_content = handle_garbled(new_content)
if weibo_content is not None:
return weibo_content
Expand Down
6 changes: 6 additions & 0 deletions weibo_spider/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def __init__(self, config):
self.sqlite_config = config.get('sqlite_config')
self.kafka_config = config.get('kafka_config')
self.mongo_config = config.get('mongo_config')
self.post_config = config.get('post_config')
self.user_config_file_path = ''
user_id_list = config['user_id_list']
if FLAGS.user_id_list:
Expand Down Expand Up @@ -284,6 +285,11 @@ def initialize_info(self, user_config):

self.writers.append(KafkaWriter(self.kafka_config))

if 'post' in self.write_mode:
from .writer import PostWriter

self.writers.append(PostWriter(self.post_config))

self.downloaders = []
if self.pic_download == 1:
from .downloader import (OriginPictureDownloader,
Expand Down
3 changes: 2 additions & 1 deletion weibo_spider/writer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
from .txt_writer import TxtWriter
from .sqlite_writer import SqliteWriter
from .kafka_writer import KafkaWriter
from .post_writer import PostWriter

__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter]
__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter, PostWriter]
59 changes: 59 additions & 0 deletions weibo_spider/writer/post_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import codecs
import json
import logging
import os
import requests

from .writer import Writer
from time import sleep
from requests.exceptions import RequestException

logger = logging.getLogger('spider.post_writer')

class PostWriter(Writer):
def __init__(self, post_config):
self.post_config = post_config
self.api_url = post_config['api_url']
self.api_token = post_config.get('api_token', None)
self.dba_password = post_config.get('dba_password', None)

def write_user(self, user):
self.user = user

def _update_json_data(self, data, weibo_info):
"""将获取到的微博数据转换为json输出模式一致"""
data['user'] = self.user.__dict__
if data.get('weibo'):
data['weibo'] += weibo_info
else:
data['weibo'] = weibo_info
return data

def send_post_request_with_token(self, url, data, token, max_retries, backoff_factor):
headers = {
'Content-Type': 'application/json',
'api-token': f'{token}',
}
for attempt in range(max_retries + 1):
try:
response = requests.post(url, json=data, headers=headers)
if response.status_code == requests.codes.ok:
return response.json()
else:
raise RequestException(f"Unexpected response status: {response.status_code}")
except RequestException as e:
if attempt < max_retries:
sleep(backoff_factor * (attempt + 1)) # 逐步增加等待时间,避免频繁重试
continue
else:
logger.error(f"在尝试{max_retries}次发出POST连接后,请求失败:{e}")

def write_weibo(self, weibos):
"""将爬到的信息POST到API"""
data = {}
data = self._update_json_data(data, [w.__dict__ for w in weibos])
if data:
self.send_post_request_with_token(self.api_url, data, self.api_token, 3, 2)
logger.info(u'%d条微博通过POST发送到 %s', len(weibos), self.api_url)
else:
logger.info(u'没有获取到微博,略过API POST')
Loading