forked from yangbo/mhtviewer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
executable file
·148 lines (119 loc) · 3.49 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# -*- coding: utf-8 -*-
import logging
import sys
import re
import quopri
import base64
import os
# Root logger setup: show INFO and above, each record prefixed with a
# timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def log(msg):
    """Emit *msg* through the root logger at INFO level."""
    logging.log(logging.INFO, msg)
def main():
    """Command-line entry point: feed each line of an MHT file to processline().

    Exactly one command-line argument is expected, the path of the MHT file.
    With any other argument count a usage message is printed and nothing is
    processed.
    """
    args = sys.argv
    if len(args) != 2:
        print("Usage: extract.py <mht file>")
        return
    mht = args[1]
    log('Extract multi-part of "%s" ...' % mht)
    # Binary mode keeps line endings and payload bytes untouched.
    with open(mht, 'rb') as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for raw_line in f:
            # processline() compares lines with str separators and applies
            # str regexes, so bytes lines must be turned into text first.
            # Latin-1 maps every byte to exactly one character, so decoding
            # can never fail and no byte value is lost.
            processline(raw_line.decode('latin-1'))
# Module-level parser state shared by the functions in this file.
boundary = ""  # boundary text captured by getboundary()
state = 'none'  # none, start-head, head-end
body = ""  # lines collected for the part currently being read
# Header values remembered for the current part; empty until a header sets them.
content_type = content_encoding = content_location = ''
def processline(line):
    """Advance the part parser by one input line.

    The line is first offered to getboundary(). A line that equals the
    opening or closing delimiter built from the current boundary saves the
    part collected so far, clears the per-part values and switches to header
    reading. While reading headers, a blank line switches to body reading and
    any other line goes to read_header(). While reading a body, the line is
    appended to the global ``body``.
    """
    global state, body

    getboundary(line)
    opening = '------=%s' % boundary
    closing = '------=%s--' % boundary
    # print('sep: %s' % opening)
    text = line.strip()

    if text in (opening, closing):
        state = 'start-head'
        log('status: %s' % state)
        # Flush the finished part before its values are wiped.
        save_block()
        reset_content()
    elif state == 'start-head':
        if text == '':
            # Blank line: the header section of this part is over.
            state = 'head-end'
        else:
            read_header(line)
    elif state == 'head-end':
        body = body + line
def save_block():
    """Decode the body of the current part and hand it to save_file().

    Reads the globals ``body``, ``content_encoding`` and ``content_location``.
    Does nothing when no body has been collected. Bodies marked
    ``quoted-printable`` or ``base64`` are decoded; a body with any other
    (or no) transfer encoding is passed on unchanged.
    """
    if not body:
        return
    # Compare case-insensitively so that e.g. "Base64" or "BASE64" in the
    # header is still recognised.
    encoding = content_encoding.lower()
    if encoding == 'quoted-printable':
        decoded_body = quopri.decodestring(body)
    elif encoding == 'base64':
        decoded_body = base64.b64decode(body)
    else:
        # 7bit / 8bit / binary / missing encoding: the payload is stored as
        # is. Without this branch such parts were written as empty files.
        decoded_body = body
    log('will save file "%s", encoding=%s' % (content_location, content_encoding))
    # save to file
    save_file(decoded_body)
def save_file(decoded_body):
    """Write one decoded part to a path derived from ``content_location``.

    The ``file://`` prefix and a drive prefix such as ``C:`` are stripped
    from the location, and the remaining path is re-rooted under the current
    working directory. Missing directories are created.

    Nothing is written when the location is empty, when it has no usable
    file name, or when it would resolve to a place outside the current
    working directory.

    Args:
        decoded_body: Payload to store. ``bytes`` is written unchanged; text
            is encoded as Latin-1 (one byte per character for code points
            below 256), falling back to UTF-8 when that is not possible.

    Raises:
        OSError: If a directory cannot be created or the file cannot be
            written.
    """
    # empty then return
    if not content_location:
        return
    # remove file://
    location = re.sub('file://', '', content_location)
    # remove the drive prefix, e.g. "C:"
    location = re.sub(r'\\?\w:', '', location)
    dirname, filename = os.path.split(location)
    subdir = os.path.relpath('./' + dirname)
    # The location comes from the input file; refuse anything that climbs
    # out of the working directory through ".." components.
    if subdir == os.pardir or subdir.startswith(os.pardir + os.sep):
        log('skipped "%s": path leaves the output directory' % content_location)
        return
    if filename in ('', os.curdir, os.pardir):
        log('skipped "%s": no file name' % content_location)
        return
    # Create the directory only when it is missing, so that a real failure
    # (e.g. permission denied) is reported instead of silently ignored.
    if not os.path.isdir(subdir):
        os.makedirs(subdir)
    if not isinstance(decoded_body, bytes):
        try:
            decoded_body = decoded_body.encode('latin-1')
        except UnicodeEncodeError:
            decoded_body = decoded_body.encode('utf-8')
    relative_file_name = os.path.join(subdir, filename)
    # Binary mode: decoded parts may be images or other non-text data, and
    # text mode would translate newlines on some platforms.
    with open(relative_file_name, 'wb') as f:
        f.write(decoded_body)
    log('saved file: %s' % relative_file_name)
def reset_content():
    """Clear the collected body and the three remembered header values."""
    global body, content_type, content_location, content_encoding
    body = content_type = content_encoding = content_location = ''
def read_header(line):
    """Remember the value of a recognised part header found on *line*.

    Content-Location, Content-Transfer-Encoding and Content-Type are matched
    case-insensitively at the start of the line; the stripped value is stored
    in the corresponding global. Any other line only gets logged.
    """
    global content_type, content_location, content_encoding

    log('readHeader: %s' % line.strip())

    def header_value(pattern):
        # Stripped text after the header name, or None when the line is a
        # different header.
        hit = re.match(pattern, line, flags=re.IGNORECASE)
        return None if hit is None else hit.group(1).strip()

    location = header_value('Content-Location:(.*)')
    if location is not None:
        content_location = location

    encoding = header_value('Content-Transfer-Encoding:(.*)')
    if encoding is not None:
        content_encoding = encoding

    mime_type = header_value('Content-Type:(.*)')
    if mime_type is not None:
        content_type = mime_type
def getboundary(line):
    """Store the multipart boundary in the global ``boundary`` if *line* declares one.

    A line is recognised when it starts with a ``Content-Type:
    multipart/related;`` header that carries a ``boundary="----=..."``
    parameter. Only the text after the ``----=`` prefix is stored. Lines
    that do not match leave ``boundary`` untouched.
    """
    global boundary  # set global variable
    # - IGNORECASE: header and parameter names may be written in any case,
    #   matching how read_header() treats header names.
    # - ``.*?`` lets other parameters appear before ``boundary=``.
    # - ``[^"]*`` stops at the closing quote; a greedy ``.*`` would also
    #   swallow any quoted parameter that follows on the same line.
    matcher = re.match(
        r'Content-Type:\s*multipart/related;.*?\bboundary="----=([^"]*)"',
        line,
        flags=re.IGNORECASE,
    )
    if matcher:
        boundary = matcher.group(1)
# Run the extractor only when this file is executed as a script.
if __name__ == '__main__':
    main()