Skip to content

Commit

Permalink
新增友链获取策略的common规则
Browse files Browse the repository at this point in the history
代码优化
release 4.2.5
  • Loading branch information
hiltay committed Feb 11, 2022
1 parent c0ccbeb commit 3e3fb9c
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 138 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

⭐从4.1.3版本开始,一定要在配置项中配置友链页的获取策略
```
目前 release 4.2.4 版本:
目前 release 4.2.5 版本:
- 支持 gitee 和 github 上的 issues 友链获取
- 支持butterfly、volantis、matery、sakura、fluid、nexmoe、Yun、stun、stellar主题的友链和文章获取
- 支持butterfly、volantis、matery、sakura、fluid、nexmoe、Yun、stun、stellar、next主题的友链和文章获取
- 支持feed订阅规则,如atom、rss等规则(支持wordpress类型的博客)
- 支持自定义订阅后缀
- 支持站点屏蔽
Expand All @@ -19,7 +19,7 @@
- 新增数据存储配置,提供多种存储方式
- 新增部署方式配置,可部署在本地服务端
- 将api整合到主仓库
- 新增next四种主题的文章获取,与Yun规则合并,暂不支持友链页获取
- 新增友链获取策略的common规则
bug修复:
- wordpress类型博客的时间格式问题
Expand Down
3 changes: 2 additions & 1 deletion hexo_circle_of_friends/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# - default:默认。指定友链页主题。示例:如果您的友链页为https://www.yyyzyyyz.cn/link/,请选择butterfly,以此类推
# - incompat:如果theme中不支持您的主题,请选择此项。此时建议使用配置项友链
# theme:必填,可选参数如下(这些是目前支持的主题):
# - common: 通用主题,请参考:https://hexo-circle-of-friends-doc.vercel.app/#/developmentdoc?id=%e5%8f%8b%e9%93%be%e9%a1%b5%e9%80%82%e9%85%8d
# - butterfly:butterfly主题
# - fluid:fluid主题
# - matery:matery主题
Expand Down Expand Up @@ -97,7 +98,7 @@

##############################除非您了解本项目,否则请勿修改以下内容################################

VERSION = "4.2.4"
VERSION = "4.2.5"

# debug
# debug模式
Expand Down
193 changes: 59 additions & 134 deletions hexo_circle_of_friends/spiders/hexo_circle_of_friends.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from hexo_circle_of_friends.utils.get_url import get_theme_url, Yun_async_link_handler
from hexo_circle_of_friends.utils.regulations import *
from hexo_circle_of_friends.utils.process_time import format_time

# from hexo_circle_of_friends import items todo use items

# post_parsers = ["theme_butterfly_parse"]
Expand All @@ -19,16 +20,18 @@
]

feed_suffix = [
"atom.xml","feed/atom","rss.xml","rss2.xml","feed"
"atom.xml", "feed/atom", "rss.xml", "rss2.xml", "feed"
]


class CRequest(Request):
    """Request variant whose ``dont_filter`` argument defaults to ``True``.

    All other arguments are forwarded unchanged to ``Request.__init__``.
    """

    def __init__(self, url, callback=None, meta=None, dont_filter=True,
                 errback=None, *args, **kwargs):
        super().__init__(
            url,
            callback,
            *args,
            meta=meta,
            dont_filter=dont_filter,
            errback=errback,
            **kwargs
        )


class FriendpageLinkSpider(scrapy.Spider):
name = 'hexo_circle_of_friends'
allowed_domains = ['*']
Expand All @@ -39,7 +42,7 @@ def __init__(self, name=None, **kwargs):
self.friend_list = queue.Queue()
self.today = datetime.datetime.now().strftime('%Y-%m-%d')

super(FriendpageLinkSpider,self).__init__(name, **kwargs)
super(FriendpageLinkSpider, self).__init__(name, **kwargs)

def start_requests(self):
# 从配置文件导入友链列表
Expand Down Expand Up @@ -141,11 +144,11 @@ def friend_poor_parse(self, response):
friend[1] += "/" if not friend[1].endswith("/") else ""
if settings.SETTINGS_FRIENDS_LINKS['enable'] and len(friend) == 4:
url = friend[1] + friend[3]
yield CRequest(url,self.post_feed_parse,meta={"friend": friend},errback=self.errback_handler)
yield CRequest(url, self.post_feed_parse, meta={"friend": friend}, errback=self.errback_handler)
self.friend_list.put(friend[:3])
continue
self.friend_list.put(friend)
for r in self.start_post_requests(friend[1],post_parsers,feed_suffix,meta={"friend": friend}):
for r in self.start_post_requests(friend[1], post_parsers, feed_suffix, meta={"friend": friend}):
yield r

# friend = ['小冰博客', 'https://blog.zzbd.org/', 'https://zfe.space/images/headimage.png']
Expand All @@ -163,14 +166,14 @@ def friend_poor_parse(self, response):
userdata["userdata"] = "userdata"
yield userdata

def start_post_requests(self, domain, parsers, suffixs, meta, errback=None):
    """Yield the post-crawling requests for one friend site.

    Args:
        domain: Base URL of the friend's site. Feed suffixes are appended
            to it verbatim, so it is expected to end with "/".
        parsers: Names of spider methods used as callbacks; each is
            resolved with ``getattr(self, name)``.
        suffixs: Feed path suffixes requested for "post_feed_parse".
        meta: Request meta forwarded to every request.
        errback: Optional error callback. Falls back to
            ``self.errback_handler`` when not supplied.

    Yields:
        CRequest objects, one per feed suffix for the feed parser and one
        for the bare domain per parser.
    """
    # Keep a caller-supplied errback as is; only substitute the default
    # handler when none was given (never the Ellipsis literal).
    if not errback:
        errback = self.errback_handler
    for p in parsers:
        parser = getattr(self, p)
        if p == "post_feed_parse":
            for suffix in suffixs:
                yield CRequest(domain + suffix, parser, meta, errback=errback)
        # NOTE(review): indentation is not visible in this view; the
        # bare-domain request is assumed to be issued for every parser.
        # Confirm against the repository.
        yield CRequest(domain, parser, meta, errback=errback)

def post_feed_parse(self, response):
# print("post_feed_parse---------->" + response.url)
Expand Down Expand Up @@ -219,78 +222,49 @@ def post_feed_parse(self, response):

def theme_butterfly_parse(self, response):
    """Extract recent posts from a page using the butterfly theme's markup.

    Reads titles, links, creation and update times from the elements
    under ``#recent-posts`` and hands them to ``process_theme_postinfo``.

    Args:
        response: The crawled page; ``response.meta["friend"]`` carries
            the friend record this page belongs to.

    Yields:
        Whatever ``process_theme_postinfo`` yields for this page.
    """
    rule = "butterfly"
    friend = response.meta.get("friend")
    titles = response.css("#recent-posts .recent-post-info a:first-child::text").extract()
    partial_l = response.css("#recent-posts .recent-post-info a:first-child::attr(href)").extract()
    createds = response.css("#recent-posts .recent-post-info .post-meta-date time:first-of-type::text").extract()
    updateds = response.css("#recent-posts .recent-post-info .post-meta-date time:nth-of-type(2)::text").extract()
    try:
        yield from self.process_theme_postinfo(friend, partial_l, titles, createds, updateds, rule)
    except Exception:
        # Best effort: a page that does not match this theme simply
        # produces no items. KeyboardInterrupt/SystemExit still propagate.
        pass

def theme_fluid_parse(self, response):
    """Extract recent posts from a page using the fluid theme's markup.

    Reads titles, links and creation times from the elements under
    ``#board``; no update times are collected for this theme.

    Args:
        response: The crawled page; ``response.meta["friend"]`` carries
            the friend record this page belongs to.

    Yields:
        Whatever ``process_theme_postinfo`` yields for this page.
    """
    rule = "fluid"
    friend = response.meta.get("friend")
    titles = response.css("#board .index-header a::text").extract()
    partial_l = response.css("#board .index-header a::attr(href)").extract()
    createds = response.css("#board .post-meta time::text").extract()
    updateds = []
    try:
        yield from self.process_theme_postinfo(friend, partial_l, titles, createds, updateds, rule)
    except Exception:
        # Best effort: a page that does not match this theme simply
        # produces no items. KeyboardInterrupt/SystemExit still propagate.
        pass

def theme_matery_parse(self, response):
    """Extract recent posts from a page using the matery theme's markup.

    Reads titles and links from the cards under ``#articles`` and pulls
    ``YYYY-MM-DD`` dates out of the publish-date spans; no update times
    are collected for this theme.

    Args:
        response: The crawled page; ``response.meta["friend"]`` carries
            the friend record this page belongs to.

    Yields:
        Whatever ``process_theme_postinfo`` yields for this page.
    """
    rule = "matery"
    friend = response.meta.get("friend")
    titles = response.css("#articles .card .card-title::text").extract()
    partial_l = response.css("#articles .card a:first-child::attr(href)").extract()
    # Raw string so the regex escapes are not treated as (invalid)
    # string escapes by the Python parser.
    createds = response.css("#articles .card span.publish-date").re(r"\d{4}-\d{2}-\d{2}")
    updateds = []
    try:
        yield from self.process_theme_postinfo(friend, partial_l, titles, createds, updateds, rule)
    except Exception:
        # Best effort: a page that does not match this theme simply
        # produces no items. KeyboardInterrupt/SystemExit still propagate.
        pass

def theme_sakura_parse(self, response):
# print("theme_sakura_parse---------->" + response.url)
rule = "sakura"
friend = response.meta.get("friend")
titles = response.css("#main a.post-title h3::text").extract()
if not titles:
Expand All @@ -308,149 +282,103 @@ def theme_sakura_parse(self, response):
createds = response.css("#main .post-date::text").re("\d{4}-\d{1,2}-\d{1,2}")
updateds = []
try:
l = len(links) if len(links) < 5 else 5
titles = self.process_title(titles, l)
createds, updateds = self.process_time(createds, updateds, l)
init_post_info = self.init_post_info(friend, "sakura")
for i in range(l):
link = self.process_link(links[i], friend[1])
yield self.generate_postinfo(
init_post_info,
titles[i],
createds[i] if createds else self.today,
updateds[i] if updateds else self.today,
link
)
for post_info in self.process_theme_postinfo(friend, links, titles, createds, updateds, rule):
yield post_info
except:
pass

def theme_volantis_parse(self, response):
    """Extract recent posts from a page using the volantis theme's markup.

    Reads titles, links and creation times from the elements under
    ``.post-list``; no update times are collected for this theme.

    Args:
        response: The crawled page; ``response.meta["friend"]`` carries
            the friend record this page belongs to.

    Yields:
        Whatever ``process_theme_postinfo`` yields for this page.
    """
    rule = "volantis"
    friend = response.meta.get("friend")
    titles = response.css(".post-list .article-title a::text").extract()
    partial_l = response.css(".post-list .article-title a::attr(href)").extract()
    createds = response.css(".post-list .meta-v3 time::text").extract()
    updateds = []
    try:
        yield from self.process_theme_postinfo(friend, partial_l, titles, createds, updateds, rule)
    except Exception:
        # Best effort: a page that does not match this theme simply
        # produces no items. KeyboardInterrupt/SystemExit still propagate.
        pass

def theme_nexmoe_parse(self, response):
    """Extract recent posts from a page using the nexmoe theme's markup.

    Reads titles, links and creation times from the elements under
    ``section.nexmoe-posts``; no update times are collected for this theme.

    Args:
        response: The crawled page; ``response.meta["friend"]`` carries
            the friend record this page belongs to.

    Yields:
        Whatever ``process_theme_postinfo`` yields for this page.
    """
    rule = "nexmoe"
    friend = response.meta.get("friend")
    titles = response.css("section.nexmoe-posts .nexmoe-post h1::text").extract()
    partial_l = response.css("section.nexmoe-posts .nexmoe-post>a::attr(href)").extract()
    createds = response.css("section.nexmoe-posts .nexmoe-post-meta a:first-child::text").extract()
    updateds = []
    try:
        yield from self.process_theme_postinfo(friend, partial_l, titles, createds, updateds, rule)
    except Exception:
        # Best effort: a page that does not match this theme simply
        # produces no items. KeyboardInterrupt/SystemExit still propagate.
        pass


def theme_stun_parse(self, response):
    """Extract recent posts from a page using the stun theme's markup.

    Reads titles and links from ``article .post-title__link`` and the
    creation/update times from the matching ``.post-meta`` items.

    Args:
        response: The crawled page; ``response.meta["friend"]`` carries
            the friend record this page belongs to.

    Yields:
        Whatever ``process_theme_postinfo`` yields for this page.
    """
    rule = "stun"
    friend = response.meta.get("friend")
    titles = response.css("article .post-title__link::text").extract()
    partial_l = response.css("article .post-title__link::attr(href)").extract()
    createds = response.css("article .post-meta .post-meta-item--createtime .post-meta-item__value::text").extract()
    updateds = response.css("article .post-meta .post-meta-item--updatetime .post-meta-item__value::text").extract()
    try:
        yield from self.process_theme_postinfo(friend, partial_l, titles, createds, updateds, rule)
    except Exception:
        # Best effort: a page that does not match this theme simply
        # produces no items. KeyboardInterrupt/SystemExit still propagate.
        pass

def theme_stellar_parse(self, response):
    """Extract recent posts from a page using the stellar theme's markup.

    Reads titles and links from the elements under ``.post-list`` and the
    creation times from the ``datetime`` attribute of ``#post-meta time``;
    no update times are collected for this theme.

    Args:
        response: The crawled page; ``response.meta["friend"]`` carries
            the friend record this page belongs to.

    Yields:
        Whatever ``process_theme_postinfo`` yields for this page.
    """
    rule = "stellar"
    friend = response.meta.get("friend")
    titles = response.css(".post-list .post-title::text").extract()
    partial_l = response.css(".post-list .post-card::attr(href)").extract()
    createds = response.css("#post-meta time::attr(datetime)").extract()
    updateds = []
    try:
        yield from self.process_theme_postinfo(friend, partial_l, titles, createds, updateds, rule)
    except Exception:
        # Best effort: a page that does not match this theme simply
        # produces no items. KeyboardInterrupt/SystemExit still propagate.
        pass

def theme_next_parse(self, response):
    """Extract recent posts from a page using next/Yun style markup.

    Three candidate title selectors are tried; the one matching the most
    links wins (the earliest candidate wins a tie). Titles are then read
    with the same selector, and creation/update times come from the
    ``time`` elements carrying ``dateCreated`` / ``dateModified``.

    Args:
        response: The crawled page; ``response.meta["friend"]`` carries
            the friend record this page belongs to.

    Yields:
        Whatever ``process_theme_postinfo`` yields for this page.
    """
    rule = "next/Yun"
    friend = response.meta.get("friend")
    base_css = ["article h2", "article .post-title", "article .post-title-link"]
    # Query each candidate once and keep the results so the winning
    # selector does not have to be evaluated a second time.
    link_candidates = [
        response.css("%s a:first-child::attr(href)" % css).extract()
        for css in base_css
    ]
    # max() returns the first maximal index, matching list.index(max(...)).
    ind = max(range(len(base_css)), key=lambda i: len(link_candidates[i]))
    links = link_candidates[ind]
    titles = response.css("%s a:first-child::text" % base_css[ind]).extract()
    createds = response.css("article time[itemprop*=dateCreated]::text").extract()
    updateds = response.css("article time[itemprop=dateModified]::text").extract()
    try:
        yield from self.process_theme_postinfo(friend, links, titles, createds, updateds, rule)
    except Exception:
        # Best effort: a page that does not match this theme simply
        # produces no items. KeyboardInterrupt/SystemExit still propagate.
        pass

def process_theme_postinfo(self, friend, links, titles, createds, updateds, rule):
    """Build post items from the raw lists scraped by a theme parser.

    At most the first five links are used. Titles and times are first
    normalised by ``process_title`` / ``process_time``.

    Args:
        friend: Friend record; ``friend[1]`` is passed to ``process_link``
            together with each post link.
        links: Post links as scraped from the page.
        titles: Post titles as scraped from the page.
        createds: Creation times as scraped from the page.
        updateds: Update times as scraped from the page.
        rule: Name of the theme rule that produced the data.

    Yields:
        One ``generate_postinfo`` result per post.

    Raises:
        ValueError: If neither creation nor update times are available
            after ``process_time``.
    """
    count = min(len(links), 5)
    titles = self.process_title(titles, count)
    # After process_time each list is None, empty, or populated.
    createds, updateds = self.process_time(createds, updateds, count)
    init_post_info = self.init_post_info(friend, rule)
    if not createds and not updateds:
        # Raise an explicit error instead of a bare ``raise``, which only
        # produced an accidental RuntimeError outside an except block.
        raise ValueError("no time information found for rule %r" % rule)
    for i in range(count):
        link = self.process_link(links[i], friend[1])
        yield self.generate_postinfo(
            init_post_info,
            titles[i],
            createds[i],
            updateds[i],
            link
        )

def init_post_info(self, friend, rule):
post_info = {
Expand Down Expand Up @@ -522,6 +450,3 @@ def errback_handler(self, error):
def typecho_errback_handler(self, error):
    """Retry a failed request once, parsing the result as a feed.

    The retry targets the same URL with the same meta as the failed
    request, uses ``post_feed_parse`` as callback and falls back to
    ``errback_handler`` if it fails again.
    """
    failed = error.request
    retry = Request(
        failed.url,
        callback=self.post_feed_parse,
        dont_filter=True,
        meta=failed.meta,
        errback=self.errback_handler,
    )
    yield retry



Loading

0 comments on commit 3e3fb9c

Please sign in to comment.