-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawlBlog.py
44 lines (39 loc) · 1.29 KB
/
crawlBlog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# coding:utf8
from WeixinUtils import NewsItem
__author__ = 'Jayvee'
import urllib2
from bs4 import BeautifulSoup
def get_archives(maxcount=10):
"""
获取个人博客的文章列表
:rtype : object
:return: postitem类型的list
"""
baseurl = "http://jayveehe.github.io"
req = urllib2.Request("http://jayveehe.github.io/archives/")
html = urllib2.urlopen(req)
text = html.read()
soup = BeautifulSoup(text)
# print soup.prettify()
count = 1
postlist = []
# 添加博客文章列表的链接
mainitem = NewsItem("http://jayveehe.github.io/archives/", "Jayvee's Blog文章列表",
"http://jayveestorage.qiniudn.com/blogtitle.jpg")
postlist.append(mainitem)
for article in soup.find_all("a", "archive-article-title"):
item = NewsItem(baseurl + article["href"], article.text, "http://jayveehe.github.io/img/avatar.jpg")
# print count, ":", article
# print article["href"]
# print item.url
# print item.title
if count < maxcount:
count += 1
postlist.append(item)
else:
break
# print postlist.__len__()
return postlist
# postlist = get_archives()
# mainbody = WeixinUtils.make_news(postlist, 123, 321)
# print mainbody