-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathData_clean_1
110 lines (77 loc) · 3.27 KB
/
Data_clean_1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
# coding: utf-8
# importing all the libraries
import xml.etree.ElementTree as ET
import string
import os
import re
# Function to remove HTML tags (and everything inside the angle brackets)
# from the text, then blank out every remaining non-letter character.
# A tag must start with a letter (optionally preceded by "/") so that literal
# comparisons such as "p < 0.05" are not mistaken for markup.
_TAG_RE = re.compile(r'</?[a-zA-Z][^<>]*>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def cleanhtml(raw_html):
    """Return *raw_html* with tags removed and non-letters replaced by spaces.

    Args:
        raw_html: Text to clean. Any object is accepted and converted with
            ``str()``; ``None`` is treated as empty text.

    Returns:
        A string containing only ASCII letters and spaces. Each tag and each
        non-letter character is replaced by a single space, so runs of
        spaces are not collapsed.
    """
    if raw_html is None:
        # str(None) would otherwise inject the word "None" into the output.
        return ""
    # Replace tags with a space (not "") so adjacent words are not glued
    # together, e.g. "<p>foo</p><p>bar</p>".
    without_tags = _TAG_RE.sub(" ", str(raw_html))
    return _NON_LETTER_RE.sub(" ", without_tags)
# Function to clean text and output it to a text file in comma separated format
# NOTE: these two constants are not referenced anywhere in the visible code;
# they are kept so that any external user of the names keeps working.
fulltext = "fulltext"
xml = ".xml"


def getcsv(outfile, parent_dir, word):
    """Write one ``directory,abstract text,word`` line per abstract element.

    Walks *parent_dir* recursively. For every directory that contains a
    ``fulltext.xml`` file, the file is parsed and each child element of
    ``./front/article-meta/abstract`` is cleaned (``cleanhtml``, lower-cased,
    digits removed, surrounding whitespace stripped) and written to
    *outfile*.

    Args:
        outfile: Writable text file object. It is closed by this function
            before returning, including when an error is raised.
        parent_dir: Root directory to search.
        word: Label appended as the last field of every line.

    Raises:
        xml.etree.ElementTree.ParseError: If a ``fulltext.xml`` is malformed.
    """
    try:
        # Search through the directory tree in a loop.
        for dirpath, _dirnames, filenames in os.walk(parent_dir):
            if 'fulltext.xml' not in filenames:
                continue
            xml_path = os.path.join(dirpath, 'fulltext.xml')
            # Directory holding the file; used as the first output field.
            file_name = os.path.dirname(xml_path)
            # A separate name is used so the os.walk loop variable is not
            # clobbered by the XML root element.
            article_root = ET.parse(xml_path).getroot()
            # The trailing "/" makes ElementTree select the child elements
            # of <abstract>, not the <abstract> element itself.
            for node in article_root.findall("./front/article-meta/abstract/"):
                # node.text alone stops at the first nested element (e.g.
                # <italic>), silently truncating the abstract; itertext()
                # also collects nested and trailing text.
                raw_text = "".join(node.itertext())
                cleaned = cleanhtml(raw_text).lower()
                # Defensive: drop any digits left over after cleaning.
                cleaned = re.sub(r'\d+', '', cleaned)
                # Remove spaces at the beginning and at the end of the string.
                cleaned = cleaned.strip()
                if not cleaned:
                    # Nothing usable in this element; avoid an empty row.
                    continue
                outfile.write(str(file_name) + "," + cleaned + "," + word + "\n")
    finally:
        # Callers rely on the file being closed (and therefore flushed).
        outfile.close()
# Extract the abstracts of the true positives into true.txt.
# getcsv() closes the output file itself once it has finished writing.
word1 = "true"
parent_dir_true = "D:\\try1\\true"
outfile1 = open('true.txt','w')
getcsv(outfile1, parent_dir_true, word1)

# Extract the abstracts of the false positives into false.txt.
word2 = "false"
parent_dir_false = "D:\\try1\\false"
outfile2 = open('false.txt','w')
getcsv(outfile2, parent_dir_false, word2)
# Function removing very short lines (e.g. a lone "abstract" heading).
def remove_abs(out_put, in_put):
    """Copy every line of *in_put* that has more than two words to *out_put*.

    Words are counted by splitting the whole line on whitespace.

    Args:
        out_put: Writable text file object. It is closed by this function
            before returning, including when an error is raised.
        in_put: Path of the text file to read.
    """
    try:
        with open(in_put, "r") as a_file:
            # Keep only lines containing more than two whitespace-separated
            # words; shorter lines are dropped.
            out_put.writelines(
                line for line in a_file if len(line.split()) > 2
            )
    finally:
        out_put.close()
# Filter the short lines out of both extracted files.
# remove_abs() closes each output file when it is done.
in_put1 = 'true.txt'
out_put1 = open('true_new.txt','w')
in_put2 = 'false.txt'
out_put2 = open('false_new.txt','w')
remove_abs(out_put1, in_put1)
remove_abs(out_put2, in_put2)
# Merging two files (true positives followed by false positives).
# Reading data from file1
with open('true_new.txt') as fp:
    data = fp.read()
# Reading data from file2
with open('false_new.txt') as fp:
    data2 = fp.read()
# Start the data of file2 on a new line. The separator is only added when
# file1 has content that does not already end with a newline; adding it
# unconditionally would leave an empty row in the merged file.
if data and not data.endswith("\n"):
    data += "\n"
data += data2
with open('OpenVirus.txt', 'w') as fp:
    fp.write(data)
# Converting the merged text file into a csv file.
import pandas as pd

# OpenVirus.txt has no header row, so header=None stops pandas from using the
# first record as column names. dtype=str and keep_default_na=False keep
# every field verbatim; otherwise the "true"/"false" labels are inferred as
# booleans and written back as "True"/"False".
read_file = pd.read_csv(
    r'OpenVirus.txt', header=None, dtype=str, keep_default_na=False
)
# header=False: no synthetic 0,1,2 column names are written, so the first
# line of the csv is still the first record.
read_file.to_csv(r'OpenVirus.csv', index=None, header=False)