-
Notifications
You must be signed in to change notification settings - Fork 0
/
weipu3.py
148 lines (126 loc) · 7.21 KB
/
weipu3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# -*-coding:utf-8-*-
import json
import os
import re
import sys

from utils.net import Net
from utils.progress_bar import ProgressBar
class Weipu(object):
    """Collect article metadata from the CQVIP journal site (qikan.cqvip.com).

    All network access and most text extraction go through the project's
    ``Net`` helper; this class only knows the site's URLs, XPath expressions
    and the layout of the output table.
    """

    # Search request payload. The three ``%s`` slots are, in order: journal
    # name, page number, journal name (again, for the display title).
    _SEARCH_TEMPLATE = (
        '{"ObjectType":1,"SearchKeyList":[{"FieldIdentifier":"J","SearchKey":"%s",'
        '"PreLogicalOperator":"","IsExact":"1"}],"SearchExpression":"",'
        '"BeginYear":"2014","EndYear":"2019","JournalRange":"","DomainRange":"",'
        '"PageSize":"100","PageNum":"%s","Sort":"0","ClusterFilter":"","SType":"",'
        '"StrIds":"","UpdateTimeType":"","ClusterUseType":"Article","IsNoteHistory":1,'
        '"AdvShowTitle":"刊名=%s AND 年份:2014-2019","ObjectId":"",'
        '"ObjectSearchType":"0","ChineseEnglishExtend":"0","SynonymExtend":"0",'
        '"ShowTotalCount":"0","AdvTabGuid":"1db1e78c-4454-8675-d5b9-3eb71928b687"}'
    )

    # Phone-number patterns, tried in order. The separator class accepts an
    # ASCII colon, a full-width colon (U+FF1A) and a space; the number itself
    # may contain digits, '-' and the em dash.
    _TEL_PATTERNS = tuple(
        re.compile(label + r"[:\uff1a ]*([\d—\-]+)", flags=re.I)
        for label in ("Tel", "电话")
    )

    def __init__(self):
        """Create the ``Net`` helper and load header/form configuration.

        The config paths are relative, so they are resolved against the
        current working directory.
        """
        self.__net = Net()
        self.__headers = self.__net.parse_form(
            "config/weipu_header.conf", sep=":")
        self.__get_headers = self.__net.parse_form(
            "config/weipu_get_header.conf", sep=":")
        self.__url = "http://qikan.cqvip.com/Search/SearchList"
        self.__form = self.__net.parse_form("config/weipu_form.conf", sep="\t")

    def form_weipu_search_unit(self, field_name, field_value, pre_logical="AND", exact="0") -> str:
        """Return one search condition as a compact JSON object string.

        Args:
            field_name: Value for ``FieldIdentifier``.
            field_value: Value for ``SearchKey``.
            pre_logical: Value for ``PreLogicalOperator``.
            exact: Value for ``IsExact``.

        Every value is emitted as a JSON string. Serialising with ``json``
        (instead of string formatting) keeps the result valid when a value
        contains quotes or backslashes.
        """
        unit = {
            "FieldIdentifier": str(field_name),
            "SearchKey": str(field_value),
            "PreLogicalOperator": str(pre_logical),
            "IsExact": str(exact),
        }
        # Compact separators and raw non-ASCII reproduce the hand-written
        # layout for ordinary values.
        return json.dumps(unit, ensure_ascii=False, separators=(",", ":"))

    def write_reponse(self, response):
        """Dump ``response.content`` to data/current_response_html.html.

        Bytes are written verbatim in binary mode; text is written as UTF-8.
        """
        body = response.content
        target = "data/current_response_html.html"
        if isinstance(body, (bytes, bytearray)):
            with open(target, mode="wb") as rf:
                rf.write(body)
        else:
            with open(target, encoding="utf-8", mode="w") as rf:
                rf.write(body)

    def search_id(self, journal, page="1") -> list:
        """Return the distinct article ids on one result page of a journal.

        Args:
            journal: Journal name used as an exact search key.
            page: Page number of the result list (100 results per page).

        Returns:
            The ``articleid`` attribute values found in the response,
            de-duplicated, in first-seen order.
        """
        # Escape the name as a JSON string body so quotes or backslashes in
        # it cannot break the payload.
        safe_journal = json.dumps(str(journal), ensure_ascii=False)[1:-1]
        req_data = {
            "searchParamModel": self._SEARCH_TEMPLATE % (safe_journal, page, safe_journal),
        }
        idset = self.__net.requests(
            self.__url, req_data, headers=self.__headers, timeout=20).xpath("//@articleid")
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(idset))

    def all_journal_ids(self, journal):
        """Yield every article id of ``journal``, walking pages from 1 upward.

        Iteration stops at the first page that is empty or contributes no id
        that has not been seen already; the second condition prevents an
        endless loop if the server repeats a page for out-of-range numbers.
        """
        seen = set()
        page = 1
        while True:
            fresh = [i for i in self.search_id(journal=journal, page=page) if i not in seen]
            if not fresh:
                break
            seen.update(fresh)
            yield from fresh
            page += 1

    def find_tel(self, string):
        """Return the phone number following "Tel" or "电话" in ``string``.

        The "Tel" label is tried first (case-insensitively). Returns an empty
        string when neither label is followed by a number.
        """
        for pattern in self._TEL_PATTERNS:
            res = pattern.search(string)
            if res:
                return res.group(1)
        return ""

    def find_article_info(self, article_id) -> list:
        """Fetch an article's detail page and extract one output row.

        Args:
            article_id: Id appended to the detail-page URL.

        Returns:
            A list in the column order used by ``run``: article id, journal
            name, year, issue, title, authors, organisations, keywords, raw
            author note, then name/e-mail/phone of the corresponding author
            and of the first author.
        """
        tree = self.__net.requests(method="get", timeout=20, headers=self.__get_headers,
                                   url="http://qikan.cqvip.com/Qikan/Article/Detail?id={}".format(article_id))
        all_authors = [a.strip() for a in tree.xpath(
            '//div[@class="author"]/span/a/span/text()')]
        # Flatten the free-text author note to one line. The full-width full
        # stop (U+FF0E) is normalised to '.' so that addresses typed with it
        # still look like e-mail addresses.
        author_info = ''.join(tree.xpath('//div[@class="others"]/text()')
                              ).strip().replace('\uff0e', '.').replace('\n', '').replace('\r', '').replace('\t', ';')
        # Text before the "corresponding author" marker describes the first
        # author, text after it the corresponding author. Try the Chinese
        # marker first, then the English one; the appended "" guarantees a
        # second element when no marker exists.
        infos = re.split('通[讯信]作者', author_info)
        if len(infos) == 1:
            infos = re.split("Corresponding", author_info, flags=re.I) + [""]
        first_author, com_author = infos[0].strip(), infos[1].strip()
        title = self.__net.xpath_first(tree, "//h1/text()")
        organ = ';'.join(tree.xpath(
            '//div[@class="organ"]/span/a/span/text()')).replace('\t', ';')
        first_author_name = self.__net.find_name(first_author, all_authors)
        if not first_author_name and all_authors:
            # Fall back to the first listed author.
            first_author_name = all_authors[0]
        com_author_name = self.__net.find_name(com_author, all_authors)
        first_author_email = self.__net.find_email(first_author)
        com_author_email = self.__net.find_email(com_author)
        first_author_tel = self.find_tel(first_author)
        com_author_tel = self.find_tel(com_author)
        journal_info = self.__net.xpath_first(
            tree, '//span[@class="vol"]/text()')
        year = self.__net.search_str(r"(\d+)年", journal_info, 1)
        issue = self.__net.search_str(r"(\d+)期", journal_info, 1)
        journal_name = self.__net.xpath_first(
            tree, '//span[@class="from"]/a/text()')
        keywords = ','.join(tree.xpath(
            '//div[@class="subject"]/span/a/text()'))
        return [article_id, journal_name, year, issue, title, ','.join(all_authors), organ, keywords,
                author_info, com_author_name, com_author_email, com_author_tel,
                first_author_name, first_author_email, first_author_tel]

    def run(self, journal_list_file):
        """Scrape every journal listed in ``journal_list_file``.

        Args:
            journal_list_file: Text file with one journal name per line.

        Rows are appended to ``results/<list name>_article_information.txt``
        and finished journals to ``data/done_journals.txt``, both next to
        this script. Article ids and journals already recorded there are
        skipped, so an interrupted run can be resumed.
        """
        base_path = os.path.dirname(os.path.abspath(__file__)) + '/'
        results_file_name = base_path + "results/" + \
            os.path.splitext(os.path.basename(journal_list_file))[
                0] + "_article_information.txt"
        done_journal_list_file_name = base_path + "data/done_journals.txt"
        # Opening in append mode does not create missing parent directories.
        for out_path in (results_file_name, done_journal_list_file_name):
            os.makedirs(os.path.dirname(out_path), exist_ok=True)

        with open(journal_list_file) as jl:
            journal_list = [jn.strip()
                            for jn in jl.read().split('\n') if jn.strip()]
        colnames = ['文章ID', '期刊名', '年份', '期', '论文题目', '作者', '机构', '关键词',
                    '作者简介', '通讯作者姓名', '通讯作者邮箱', '通讯作者电话', '第一作者姓名', '第一作者邮箱', '第一作者电话']

        print("results stored in {}".format(results_file_name))
        try:
            # Ids already present in the results table; a set keeps the
            # per-article membership test O(1).
            done = set(self.__net.get_file_column(
                results_file_name, number=1).tolist())
        except Exception:
            # Best effort: any failure to read the table means start fresh.
            print("did not found results table, create a new one")
            done = set()

        try:
            # Read with the same encoding the file is written with below.
            with open(done_journal_list_file_name, encoding='utf-8') as djl:
                done_journal_list = {jn.strip()
                                     for jn in djl.read().split('\n') if jn.strip()}
        except (OSError, UnicodeError):
            done_journal_list = set()

        # Decide before opening in append mode (which creates the file): the
        # header belongs only at the top of a new or empty table.
        need_header = (not os.path.exists(results_file_name)
                       or os.path.getsize(results_file_name) == 0)

        bar = ProgressBar(len(journal_list))
        with open(results_file_name, mode='a', encoding='utf-8') as results, \
                open(done_journal_list_file_name, mode='a', encoding='utf-8') as done_journal_file:
            if need_header:
                results.write('\t'.join(colnames) + '\n')
            for journal in journal_list:
                if journal not in done_journal_list:
                    for paper_id in self.all_journal_ids(journal):
                        if paper_id in done:
                            continue
                        found_info = self.find_article_info(paper_id)
                        results.write('\t'.join(found_info) + '\n')
                        done.add(paper_id)
                    # Flush the rows before recording the journal as done, so
                    # a crash cannot leave a "done" journal with missing rows.
                    results.flush()
                    done_journal_file.write(journal + '\n')
                    done_journal_file.flush()
                    done_journal_list.add(journal)
                bar.move()
if __name__ == '__main__':
    # The journal list file is the single required argument; exit with a
    # usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} <journal_list_file>".format(
            os.path.basename(sys.argv[0])))
    w = Weipu()
    w.run(sys.argv[1])