-
Notifications
You must be signed in to change notification settings - Fork 110
/
Copy pathpronom_ident.py
executable file
·102 lines (86 loc) · 3.4 KB
/
pronom_ident.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
"""
pronom_ident.py - Identify a bitstream against PRONOM using fido.
"""
# https://github.com/anarchivist/fiwalk-dgi/blob/master/python/pronom_ident.py
# Author anarchivist
import os
import sys
import time
from optparse import OptionParser
from fido import fido
class FiwalkFido(fido.Fido):
    """Fido subclass that returns identification results as a dict.

    ``handle_matches`` is pointed at :meth:`parse_matches`, so
    :meth:`identify_file` hands back a dictionary of ``pronom*`` keys
    built from the matches.
    """

    def __init__(self, **kwargs):
        """Initialise the underlying Fido object.

        All keyword arguments are forwarded to ``fido.Fido.__init__``.
        """
        # Unpack the keyword arguments: passing the dict itself would bind
        # the whole mapping to Fido's first positional parameter.
        fido.Fido.__init__(self, **kwargs)
        self.handle_matches = self.parse_matches

    def identify_file(self, filename):
        """Identify the type of @param filename.

        Signature matching is tried first. If it yields no match, or the
        file is empty, the file extension is matched instead.

        Returns the value produced by ``self.handle_matches`` (the dict
        from :meth:`parse_matches` unless it was replaced), or ``None``
        when an ``OSError`` occurs; in that case a message is written to
        stderr.
        """
        self.current_file = filename
        try:
            # time.clock() no longer exists on Python 3.8+.
            t0 = time.perf_counter()
            # The context manager closes the handle even if reading fails.
            with open(filename, "rb") as f:
                size = os.stat(filename).st_size
                self.current_filesize = size
                bofbuffer, eofbuffer, __ = self.get_buffers(
                    f, size, seekable=True
                )
            matches = self.match_formats(bofbuffer, eofbuffer)
            # filesize is made conditional because files with 0 bytes
            # are falsely characterised being 'rtf'
            # in these cases we try to match the extension instead
            if len(matches) > 0 and size > 0:
                matchtype = "signature"
            else:
                matches = self.match_extensions(filename)
                matchtype = "extension"
            # NOTE: container contents (self.zip) are not descended into;
            # this method returns one result describing the file itself.
            return self.handle_matches(
                filename, matches, time.perf_counter() - t0, matchtype
            )
        except OSError:
            sys.stderr.write(f"FIDO: Error in identify_file: Path is {filename}\n")
            return None

    def parse_matches(self, fullname, matches, delta_t, matchtype=""):
        """Convert a list of matches into a flat dict of ``pronom*`` keys.

        @param fullname: path of the identified file (not used here).
        @param matches: iterable of (format element, signature element).
        @param delta_t: elapsed identification time (not used here).
        @param matchtype: label stored as ``pronomMatchType`` on success.

        With no matches ``pronomMatchType`` is ``"fail"``. Otherwise every
        match writes the same keys, so the returned values describe the
        last match in ``matches``.
        """
        out = {
            "pronomSoftware": "fido " + fido.version,
            "pronomTotalMatches": len(matches),
        }
        if len(matches) == 0:
            out["pronomMatchType"] = "fail"
            return out

        # Optional child elements; a missing element is reported as None.
        optional_fields = (
            ("pronomFormatMimeType", "mime"),
            ("pronomFormatVersion", "version"),
            ("pronomFormatAlias", "alias"),
        )
        for fmt, sig in matches:
            out["pronomMatchType"] = matchtype
            out["pronomPuid"] = self.get_puid(fmt)
            out["pronomFormatName"] = fmt.find("name").text
            out["pronomSignatureName"] = sig.find("name").text
            for key, tag in optional_fields:
                element = fmt.find(tag)
                out[key] = element.text if element is not None else None
        return out
def pronom_ident(fn):
    """Identify the file at ``fn`` with a quiet FiwalkFido instance.

    Returns whatever ``FiwalkFido.identify_file`` returns for that path.
    """
    return FiwalkFido(quiet=True).identify_file(fn)
def main():
    """Command-line entry point: identify one file and print the result.

    The first positional argument is the path to identify. Each non-None
    entry of the result is printed as ``key: value``.

    Returns 0 on success and 1 when ``pronom_ident`` produced no result.
    Exits with status -1 (after printing the help text) when no path is
    given on the command line.
    """
    parser = OptionParser(usage="%prog [options] FILE")
    _opts, args = parser.parse_args()
    if len(args) < 1:
        parser.print_help()
        # sys.exit rather than the site-provided ``exit`` helper, which is
        # not guaranteed to exist (e.g. under ``python -S``).
        sys.exit(-1)
    out = pronom_ident(args[0])
    if out is None:
        # No result dict to print; report failure through the exit status
        # instead of crashing on ``None.items()``.
        return 1
    for key, value in out.items():
        if value is not None:
            print(key + ": " + str(value))
    return 0
if __name__ == "__main__":
    # Script entry: main()'s return value becomes the process exit status.
    raise SystemExit(main())