-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtopics_creator_ubuntu_next_generation.py
42 lines (36 loc) · 2.23 KB
/
topics_creator_ubuntu_next_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from bs4 import BeautifulSoup
import os
# Folder containing the patent XML topic files (one document per file).
xml_folder = r'/home/masteripper/terrier-project-5.5/WPI_60K/50_topics_eval_students/topics'
# Output TREC topic file; it is overwritten on every run.
trec_file = r'/home/masteripper/terrier-project-5.5/WPI_60K/queries_ubuntu.txt'

# Heading fragment used to locate the summary inside the description text.
_SUMMARY_MARKER = ' THE INVENTION'


def _extract_summary(description: str) -> str:
    """Return the summary portion of a description, or '' if there is none.

    The summary is the text between the first and the second occurrence of
    ``' THE INVENTION'`` (or up to the end of the text when the marker occurs
    only once), cut at the first literal ``'<'`` character.
    """
    # Membership test rather than ``find(...) > 0`` so that a marker at the
    # very start of the text (index 0) is not mistaken for "not found".
    if _SUMMARY_MARKER not in description:
        return ''
    return description.split(_SUMMARY_MARKER)[1].split('<')[0]


def _topic_record(soup) -> str:
    """Build the ``<top>...</top>`` record for one parsed patent document.

    Args:
        soup: The BeautifulSoup tree of a single patent XML file.

    Returns:
        The record text, terminated by a newline.

    Raises:
        AttributeError, IndexError, KeyError: If an expected element,
            child node or the ``ucid`` attribute is missing.
    """
    topic_id = soup.find('patent-document').attrs['ucid']
    title = soup.find('invention-title').contents[0]
    # First child of the first child of <abstract>.
    # NOTE(review): presumably the text of the first paragraph - confirm.
    abstract = soup.find('abstract').contents[0].contents[0]
    # NOTE(review): index 1 presumably skips a leading whitespace node -
    # confirm against the source documents.
    claims = soup.find('claims').contents[1].text
    # Despite the field name, this is the complete description text.
    short_descr = soup.find('description').text
    summary = _extract_summary(short_descr)
    # NOTE: values are written verbatim; no XML escaping is applied.
    return (
        f'<top>\n<num>{topic_id}</num>\n<title>{title}</title>\n<abstract>{abstract}</abstract>\n<claims>{claims}</claims>\n<shortdesc>{short_descr}</shortdesc>\n<summary>{summary}</summary>\n</top>\n'
    )


def main(source_dir: str = xml_folder, target_file: str = trec_file) -> int:
    """Convert every ``.xml`` file in *source_dir* into one TREC topic file.

    Files are processed in sorted name order so that the output is
    reproducible. Each file name is printed as it is processed. A file that
    lacks one of the expected elements is reported and skipped instead of
    aborting the run and leaving a truncated topic file behind.

    Args:
        source_dir: Folder holding the patent XML files.
        target_file: Path of the topic file to (over)write.

    Returns:
        The number of topics written.
    """
    written = 0
    with open(target_file, 'w', encoding='utf-8') as output_file:
        for file_name in sorted(os.listdir(source_dir)):
            if not file_name.endswith('.xml'):
                continue
            print(file_name)
            with open(os.path.join(source_dir, file_name), 'r', encoding='utf-8') as xml_file:
                xml_data = xml_file.read()
            # The input file is already closed here; parsing works on the
            # in-memory text only.
            soup = BeautifulSoup(xml_data, 'xml')
            try:
                record = _topic_record(soup)
            except (AttributeError, IndexError, KeyError) as err:
                print(f'Skipping {file_name}: unexpected document structure ({err!r})')
                continue
            output_file.write(record)
            written += 1
    return written


if __name__ == '__main__':
    main()