-
Notifications
You must be signed in to change notification settings - Fork 1
/
FileCompression.py
191 lines (165 loc) · 4.38 KB
/
FileCompression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
class FileCompression:
def __init__(self):
self.root=None
self.codes={}
self.get_ch={}
#sending the counter into insert function and forming the trie
def insert(self,c):
l=[]
#storing all the leaf nodes in a list
for (k,v) in c:
n=Node()
n.weight=v
n.key=k
l.append(n)
#using min heap to get the desired code for a character
H=BinaryHeap(l)
H.BuildHeap()
#creating the required trie
for i in range(1,len(H.E)-1):
mn=Node()
mn.right=H.extractMin()
mn.left=H.extractMin()
mn.weight=mn.right.weight+mn.left.weight
H.insert(mn)
self.root=mn #assigning the top most node to the root
#this function is used to obtain the codes
def codeformation(self,p,str1):
if p==None:
return
if p.key!=None:
self.codes[p.key]=str1
self.get_ch[str1]=p.key
self.codeformation(p.left,str1+'0')
self.codeformation(p.right,str1+'1')
#this function is used to encode the given text using the codes formed
def get_encodedData(self,text):
encoded_text=""
for ch in text:
encoded_text+=self.codes[ch]
return encoded_text
#this function fills the extra bits required
def get_padded_text(self,encoded_text):
padded_text=encoded_text
extra=8-len(encoded_text)%8
for i in range(extra):
padded_text+='0'
pad_info="{0:08b}".format(extra)
padded_text=pad_info+padded_text
return padded_text
#this function generates the bytes to be written
def get_ByteArray(self,text):
if ((len(text)%8)!=0):
print("some error in padding ,try again")
exit(0)
b=bytearray()
for i in range(0,len(text),8):
byte=text[i:i+8]
b.append(int(byte,2))
return b
#this is the actual funtion that performs the compression
def compress(self,f1):
from collections import Counter
outfile=f1[:-4]+'.bin'
with open(f1,'r') as f ,open(outfile,'wb') as comp:
data=f.read()
data=data.rstrip()
c = Counter(data)
c=list(c.most_common())
self.insert(c)
self.codeformation(self.root,'')
encoded_text=self.get_encodedData(data)
padded_text=self.get_padded_text(encoded_text)
b=self.get_ByteArray(padded_text)
comp.write(bytes(b))
print()
print("file ***",f1,"*** is compressed and stored in ***",outfile,"*** ")
print()
#this function performs the decompression of the previous file compressed
def decompress(self,f):
outfile=f[:-4]+'_decomp.txt'
with open(f,'rb') as f1 ,open(outfile,'w') as decomp:
bit_string=""
byte=f1.read(1)
while(byte):
byte=ord(byte)
bit_string+=bin(byte)[2:].rjust(8,'0')
byte=f1.read(1)
encoded_text=self.rem_padding(bit_string)
text=self.decode(encoded_text)
decomp.write(text)
print()
print("file ***",f,"*** is decompressed and stored as ***",outfile,"*** ")
print()
#this function removes the padding done previously
def rem_padding(self,bit_string):
pad_info=bit_string[0:8]
extra_pad=int(pad_info,2)
encoded_text=bit_string[8:]
encoded_text=encoded_text[:-1*extra_pad]
return encoded_text
#this function decodes using the codes obtained above
def decode(self,encoded_text):
ch_code=''
decoded_text=''
for bit in encoded_text:
ch_code+=bit
if(ch_code in self.get_ch):
decoded_text+=self.get_ch[ch_code]
ch_code=''
return decoded_text
class Node: #the trienode we use
def __init__(self):
self.weight=None
self.left=None
self.right=None
self.key=None
class BinaryHeap: #this is used to obtain the desired codes
def __init__(self,L):
self.E=[None]+L
self.l=len(self.E)
self.BuildHeap()
def heapify(self,i):
if (i<=(self.l-1)/2) and (i>0):
if(2*i+1<self.l):
t2=self.E[2*i+1].weight
t1=self.E[2*i].weight
if (2*i+1<self.l) and (self.E[i].weight>t2) and (t1>t2):
t=self.E[2*i+1]
self.E[2*i+1]=self.E[i]
self.E[i]=t
k=i*2+1
elif(self.E[i].weight>t1):
t=self.E[2*i]
self.E[2*i]=self.E[i]
self.E[i]=t
k=2*i
else:
k=-1
self.heapify(k)
def BuildHeap(self):
for i in range(int((self.l-1)/2),0,-1):
self.heapify(i)
def extractMin(self):
t=self.E[1]
self.E[1]=self.E[self.l-1]
self.E[self.l-1]=t
t=self.E[self.l-1]
del self.E[self.l-1]
self.l-=1
self.heapify(1)
return t
def insert(self,k):
self.l+=1
self.E.append(k)
i=int((self.l-1)/2)
while i>0:
self.heapify(i)
i=int(i/2)
def main():
f1=input("enter the file_name (path):")
FC=FileCompression()
FC.compress(f1)
FC.decompress(f1[:-3]+'bin')
if __name__ == '__main__':
main()