# Machine_Learning/Data_clean_1

#!/usr/bin/env python
# coding: utf-8

# importing all the libraries
import xml.etree.ElementTree as ET
import string
import os  
import re


# function to remove html tags from the text and everything inside the tags.
def cleanhtml(raw_html):
    """Return *raw_html* with markup tags and non-letter characters blanked out.

    Every ``<...>`` tag and every character outside ``a-z``/``A-Z`` is
    replaced by a single space, so words that were separated only by a tag
    or by punctuation stay separated.  Whitespace is neither collapsed nor
    trimmed.

    Args:
        raw_html: Text to clean.  Non-string values are converted with
            ``str()``; ``None`` is treated as empty text.

    Returns:
        The cleaned string (``""`` when *raw_html* is ``None``).
    """
    if raw_html is None:
        # str(None) would otherwise leak the literal word "None" into the output.
        return ""
    # Drop the tags first; otherwise tag names and attributes survive as words.
    without_tags = re.sub(r'<.*?>', ' ', str(raw_html))
    return re.sub("[^a-zA-Z]", " ", without_tags)


# Function to clean text and write it to a text file in comma-separated format
# Base name and extension strings; not referenced elsewhere in the visible code.
fulltext, xml = "fulltext", ".xml"

def getcsv(outfile, parent_dir, word):
    """Write one ``directory,text,word`` line per abstract element under *parent_dir*.

    Walks *parent_dir* recursively.  For every directory that contains a
    ``fulltext.xml`` file, each direct child element of
    ``front/article-meta/abstract`` (relative to the document root)
    contributes one line made of the directory path, the cleaned,
    lower-cased and stripped text of that element, and *word*.  Elements
    without text, or whose text contains no letters, are skipped.

    Args:
        outfile: Writable text file object.  It is always closed before this
            function returns, even when parsing fails.
        parent_dir: Root directory to search.
        word: Label written as the last field of every line.

    Raises:
        xml.etree.ElementTree.ParseError: If a ``fulltext.xml`` file is not
            well-formed XML.
    """
    try:
        # The walk variable is deliberately not called ``root`` so that it
        # cannot be clobbered by the XML root element below.
        for dir_path, _dirs, files in os.walk(parent_dir):
            if 'fulltext.xml' not in files:
                continue

            new_path = os.path.join(dir_path, 'fulltext.xml')
            file_name = os.path.dirname(new_path)  # directory holding the article
            xml_root = ET.parse(new_path).getroot()

            # "/*" selects the children of <abstract>; ElementTree treats a
            # bare trailing "/" the same way, this just spells it out.
            for element in xml_root.findall("./front/article-meta/abstract/*"):
                raw_text = element.text
                if raw_text is None:
                    # No leading text: nothing to clean, and str(None) would
                    # otherwise end up in the output as the word "none".
                    continue

                cleaned = cleanhtml(raw_text).lower()
                cleaned = re.sub(r'\d+', '', cleaned).strip()
                if not cleaned:
                    continue

                # NOTE: fields are joined without quoting, so a comma in the
                # directory path would add an extra column.
                outfile.write(str(file_name) + "," + cleaned + "," + word + "\n")
    finally:
        outfile.close()

# Extract the true-positive abstracts into true.txt.  The ``with`` block
# guarantees the handle is released even if getcsv() raises part-way through
# (getcsv also closes the file itself; closing it twice is harmless).
parent_dir_true  = "D:\\try1\\true"
word1 = "true"

with open('true.txt','w') as outfile1:
    getcsv(outfile1, parent_dir_true, word1)

# Extract the false-positive abstracts into false.txt in the same way.
parent_dir_false  = "D:\\try1\\false"
word2 = "false"

with open('false.txt','w') as outfile2:
    getcsv(outfile2, parent_dir_false, word2)


# Function that drops short lines (two or fewer whitespace-separated tokens), e.g. a lone "abstract" heading.
def remove_abs(out_put, in_put):
    """Copy the lines of *in_put* that have more than two whitespace-separated tokens.

    Lines with two or fewer tokens are dropped; kept lines are written
    unchanged and in their original order.

    Args:
        out_put: Writable text file object.  It is always closed before this
            function returns, even when *in_put* cannot be opened or read.
        in_put: Path of the text file to filter.

    Raises:
        OSError: If *in_put* cannot be opened.
    """
    try:
        with open(in_put, "r") as a_file:
            out_put.writelines(
                line for line in a_file if len(line.split()) > 2
            )
    finally:
        out_put.close()

    
# Filter both extracted files with remove_abs.  Each output is opened in its
# own ``with`` block so the handle is released even if remove_abs() raises
# (remove_abs also closes the file itself; closing it twice is harmless).
in_put1 = 'true.txt'
in_put2 = 'false.txt'

with open('true_new.txt','w') as out_put1:
    remove_abs(out_put1, in_put1)

with open('false_new.txt','w') as out_put2:
    remove_abs(out_put2, in_put2)

# merging two files (false positives and true positives)
  
# Read the filtered true-positive rows.
with open('true_new.txt') as fp:
    data = fp.read()

# Read the filtered false-positive rows.
with open('false_new.txt') as fp:
    data2 = fp.read()

# Start the second file's rows on a fresh line.  A newline is added only when
# the first part does not already end with one; adding it unconditionally
# leaves an empty line in the middle (or at the top) of the merged file.
if data and not data.endswith("\n"):
    data += "\n"
data += data2

with open('OpenVirus.txt', 'w') as fp:
    fp.write(data)


# converting the text file into csv file
import pandas as pd

# Load the merged text file into a DataFrame, then write it back out as CSV
# without the index column.
# NOTE(review): read_csv is called without header=None, so the first line of
# OpenVirus.txt is taken as the column names -- confirm that is intended.
read_file = pd.read_csv(filepath_or_buffer=r'OpenVirus.txt')
read_file.to_csv(path_or_buf=r'OpenVirus.csv', index=None)