-
Notifications
You must be signed in to change notification settings - Fork 7
/
Text_Combiner.py
104 lines (90 loc) · 3.68 KB
/
Text_Combiner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Useful to combine multiple text files extracted using userscripts
import os
import ast
import glob
from csv import writer
# Folder that is scanned for input files: the current user's Downloads folder.
# Built with os.path.join so the path separator matches the running platform;
# the trailing empty component keeps the trailing separator the value has
# always carried.
folderPath = os.path.join(os.path.expanduser("~"), "Downloads", "")
# Name fragment intended for selecting the link text files.
textFileNameFormat = "link"
# Delimiter placed between fields on each line of the input text files.
separator = " | "
# Name of the CSV file intended to receive the link output.
outputFile = "Output_links.csv"
def findFiles(name_match, file_format):
    """Return the files in the current directory matching ``*name_match*file_format``.

    The pattern is resolved by ``glob`` relative to the current working
    directory. Matches are returned ordered by modification time (oldest
    first); an empty list is returned when nothing matches. A one-line
    summary is printed in both cases.
    """
    match_format = "".join(("*", name_match, "*", file_format))
    matches = glob.glob(match_format)
    if matches:
        # Order by modification time, oldest first.
        matches.sort(key=os.path.getmtime)
        print(f'Found {len(matches)} files matching {match_format}')
    else:
        print(f'No files found matching {match_format}')
    return matches
def AddToCSV(file_name, TextList):
    """Append ``TextList`` as one row to the CSV file ``file_name``.

    The file is opened as UTF-8 in append mode, so existing rows are kept and
    the file is created when it does not exist. ``newline=''`` leaves line
    endings to the csv writer.
    """
    with open(file_name, "a+", newline='', encoding='utf-8') as csv_file:
        writer(csv_file).writerow(TextList)
def combineText(folderPath, text_filename_format, separator, output_filename, delete=False):
    """Combine the lines of matching text files into one CSV file.

    Changes the working directory to ``folderPath``, then processes every
    ``*text_filename_format*.txt`` file found there by ``findFiles``. Each
    non-blank line is split on ``separator``, every field is stripped of
    surrounding whitespace, and the fields are appended to
    ``output_filename`` as one CSV row.

    Args:
        folderPath: Folder containing the text files. If it does not exist an
            error is printed and nothing else happens.
        text_filename_format: Name fragment the text files must contain.
        separator: Delimiter between fields on a line.
        output_filename: CSV file the rows are appended to. A relative name
            resolves against ``folderPath`` because of the directory change.
        delete: When true, each text file is removed after it is processed.
    """
    if not os.path.exists(folderPath):
        print(f"Error: {folderPath} not exists")
        return
    os.chdir(folderPath)
    for file in findFiles(text_filename_format, ".txt"):
        print(file)
        with open(file, 'r', encoding='utf-8') as f:
            lineList = f.readlines()
        for line in lineList:
            # str.split always yields at least one element, so testing the
            # field list for emptiness never skips anything; test the line
            # itself to keep blank lines from becoming empty CSV rows.
            if not line.strip():
                continue
            Data_list = [field.strip() for field in line.split(separator)]
            AddToCSV(output_filename, Data_list)
        if delete:
            os.remove(file)
def extractLinks(folderPath, text_filename_format, separator, output_filename, delete=False):
    """Collect the unique link fields from matching text files into a CSV file.

    Changes the working directory to ``folderPath``, then reads every
    ``*text_filename_format*.txt`` file found there by ``findFiles``. Each
    line is split on ``separator``; every stripped field containing the text
    ``http`` is treated as a link. Each distinct link is appended to
    ``output_filename`` as a single-column row, in the order it was first
    seen.

    Args:
        folderPath: Folder containing the text files. If it does not exist an
            error is printed and nothing else happens.
        text_filename_format: Name fragment the text files must contain.
        separator: Delimiter between fields on a line.
        output_filename: CSV file the links are appended to. A relative name
            resolves against ``folderPath`` because of the directory change.
        delete: When true, the text files are removed once all links have
            been written.
    """
    if not os.path.exists(folderPath):
        print(f"Error: {folderPath} not exists")
        return
    os.chdir(folderPath)
    fileList = findFiles(text_filename_format, ".txt")
    # A dict is used as an insertion-ordered set: duplicates collapse, but the
    # rows come out in first-seen order instead of arbitrary set order.
    links = {}
    for file in fileList:
        print(file)
        with open(file, 'r', encoding='utf-8') as f:
            lineList = f.readlines()
        for line in lineList:
            for field in line.split(separator):
                item = field.strip()
                if 'http' in item:
                    links.setdefault(item, None)
    for link in links:
        AddToCSV(output_filename, [link])
    # Remove the sources only after every link has been written, so a failure
    # while reading or writing cannot lose data that was never saved.
    if delete:
        for file in fileList:
            os.remove(file)
def jsonTxt2csv(folderPath, text_filename_format, output_filename, delete=False):
    """Convert text files holding one dict literal per line into key/value CSV rows.

    Changes the working directory to ``folderPath``, then processes every
    ``*text_filename_format*.txt`` file found there by ``findFiles``. Each
    non-blank line is parsed with ``ast.literal_eval``; for every key/value
    pair of the resulting dict a ``[key, value]`` row is appended to
    ``output_filename``.

    Problems are reported rather than silently ignored: an unreadable file is
    skipped, and a line that does not parse or is not a dict is skipped while
    the remaining lines are still converted. A file that had any problem is
    never deleted.

    Args:
        folderPath: Folder containing the text files. If it does not exist an
            error is printed and nothing else happens.
        text_filename_format: Name fragment the text files must contain.
        output_filename: CSV file the rows are appended to. A relative name
            resolves against ``folderPath`` because of the directory change.
        delete: When true, each text file that converted without problems is
            removed afterwards.
    """
    if not os.path.exists(folderPath):
        print(f"Error: {folderPath} not exists")
        return
    os.chdir(folderPath)
    # The exceptions ast.literal_eval is documented to raise on malformed input.
    parse_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)
    for file in findFiles(text_filename_format, ".txt"):
        print(file)
        try:
            with open(file, 'r', encoding='utf-8') as f:
                lineList = f.readlines()
        except (OSError, UnicodeDecodeError) as err:
            print(f"Error: cannot read {file}: {err}")
            continue
        converted = True
        for line_number, line in enumerate(lineList, start=1):
            text = line.strip()
            if not text:
                continue
            try:
                record = ast.literal_eval(text)
            except parse_errors as err:
                print(f"Error: {file} line {line_number} is not a valid literal: {err}")
                converted = False
                continue
            if not isinstance(record, dict):
                print(f"Error: {file} line {line_number} is not a dict")
                converted = False
                continue
            try:
                for key, value in record.items():
                    AddToCSV(output_filename, [key, value])
            except OSError as err:
                print(f"Error: cannot write to {output_filename}: {err}")
                converted = False
                break
        if delete:
            if converted:
                os.remove(file)
            else:
                # Keep the source so the unconverted data is not lost.
                print(f"Keeping {file}: it was not fully converted")
if __name__ == '__main__':
    # Merge the "Shop" text files into one CSV and remove them afterwards.
    combineText(folderPath, "Shop", separator, "Output_Shop.csv", True)
    # jsonTxt2csv has no separator parameter (each line is parsed as a
    # literal), so it takes only the folder, name fragment, output file and
    # delete flag.
    jsonTxt2csv(folderPath, textFileNameFormat, outputFile, True)