-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patharxiv_paper.py
127 lines (109 loc) · 3.8 KB
/
arxiv_paper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Get arXiv papers
"""
import os
import json
import arxiv
def get_latest_papers(category, max_results=100):
    """
    Fetch the most recently submitted papers in an arXiv category.

    :param category: arXiv category string (e.g. 'cs.CL')
    :param max_results: the maximum number of papers to fetch
    :return: a list of dicts with 'title', 'id', 'abstract', 'url'
             and 'published' (ISO-format date) keys
    """
    client = arxiv.Client()
    search_query = f'cat:{category}'
    search = arxiv.Search(
        query=search_query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )
    papers = []
    for result in client.results(search):
        # Strip the trailing version suffix ('2301.01234v2' -> '2301.01234').
        # Use rfind, not find: an old-style id whose archive name contains
        # a 'v' (e.g. 'solv-int/9901001v1') must not be cut at the first 'v'.
        paper_id = result.get_short_id()
        version_pos = paper_id.rfind('v')
        if version_pos != -1:
            paper_id = paper_id[:version_pos]
        paper = {
            'title': result.title,
            'id': paper_id,
            'abstract': result.summary.replace('\n', ' '),  # flatten line breaks
            'url': result.entry_id,
            'published': result.published.date().isoformat()  # ISO date string
        }
        papers.append(paper)
    return papers
def filter_papers_by_keyword(papers, keyword_list):
    """
    Filter papers whose abstract mentions any of the given keywords.

    Matching is a case-insensitive substring search, so a keyword such as
    'jailbreak' also matches 'jailbreaking', and a word followed by
    punctuation ('safety,') is still found.  The previous implementation
    intersected the keyword set with ``abstract.split()``, which missed
    keywords adjacent to punctuation and could never match multi-word
    keywords.

    :param papers: a list of paper dicts, each with an 'abstract' key
    :param keyword_list: a list of keyword strings
    :return: a list of papers whose abstract contains at least one keyword
    """
    keywords = [keyword.lower() for keyword in keyword_list]
    results = []
    for paper in papers:
        # Lowercase each abstract once, not once per keyword.
        abstract = paper['abstract'].lower()
        if any(keyword in abstract for keyword in keywords):
            results.append(paper)
    return results
def deduplicate_papers(papers, file_path):
    """
    Drop papers already recorded on disk, then repeats within the list.

    Order of the surviving papers is preserved.

    :param papers: a list of paper dicts, each with an 'id' key
    :param file_path: path of the JSON file holding previously seen papers
    :return: the deduplicated list of papers
    """
    # First, discard anything whose id already appears in the on-disk record.
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            raw = f.read()
        if raw:
            recorded_ids = {entry['id'] for entry in json.loads(raw)}
            papers = [entry for entry in papers if entry['id'] not in recorded_ids]
    # Fast path: no id repeats within the remaining list.
    if len({entry['id'] for entry in papers}) == len(papers):
        return papers
    # Otherwise keep only the first occurrence of each id, in order.
    # (Repeats arise when one paper is listed under several categories.)
    seen_ids = set()
    unique_papers = []
    for entry in papers:
        if entry['id'] not in seen_ids:
            seen_ids.add(entry['id'])
            unique_papers.append(entry)
    return unique_papers
def prepend_to_json_file(file_path, data):
    """
    Insert *data* at the front of the JSON list stored in *file_path*.

    A missing or empty file is treated as an empty list, so the file is
    created on first use.

    :param file_path: the target JSON file path
    :param data: a list of records to prepend
    """
    existing = []
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            raw = f.read()
        if raw:
            existing = json.loads(raw)
    # Rewrite the whole file with the new records in front.
    with open(file_path, 'w') as f:
        json.dump(data + existing, f, indent=4)
def main():
    """Fetch the latest cs.CL papers, filter by security-related keywords,
    deduplicate against the on-disk record, and prepend the survivors."""
    papers = get_latest_papers('cs.CL', max_results=50)
    print(json.dumps(papers, indent=4))
    print()
    keyword_list = ['safety', 'security', 'adversarial', 'jailbreak', 'backdoor', 'hallucination', 'victim']
    results = filter_papers_by_keyword(papers, keyword_list)
    print(json.dumps(results, indent=4))
    print()
    results = deduplicate_papers(results, 'papers.json')
    print(json.dumps(results, indent=4))
    print()
    prepend_to_json_file('papers.json', results)


if __name__ == '__main__':
    main()