-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathrunprov.py
270 lines (248 loc) · 9.63 KB
/
runprov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""MetadataRecord extractor for provenance information in DataLad's `run` records
Concept
-------
- Find all the commits with a run-record encoded in them
- the commit SHA provides @id for the "activity"
- pull out the author/date info for annotation purposes
- pull out the run record (at the very least to report it straight
up, but there can be more analysis of the input/output specs in
the context of the repo state at that point)
- pull out the diff: this gives us the filenames and shasums of
everything touched by the "activity". This info can then be used
to look up which file was created by which activity and report
that in the content metadata
Here is a sketch of the reported metadata structure::
{
"@context": "http://openprovenance.org/prov.jsonld",
"@graph": [
# agents
{
"@id": "Name_Surname<email@example.com>",
"@type": "agent"
},
...
# activities
{
"@id": "<GITSHA_of_run_record>",
"@type": "activity",
"atTime": "2019-05-01T12:10:55+02:00",
"rdfs:comment": "[DATALAD RUNCMD] rm test.png",
"prov:wasAssociatedWith": {
"@id": "Name_Surname<email@example.com>",
}
},
...
# entities
{
"@id": "SOMEKEY",
"@type": "entity",
"prov:wasGeneratedBy": {"@id": "<GITSHA_of_run_record>"}
}
...
]
}
"""
from .base import MetadataExtractor
from .. import (
get_file_id,
get_agent_id,
)
from datalad.support.json_py import (
loads as jsonloads,
load as jsonload,
)
from datalad.utils import (
Path,
)
import logging
lgr = logging.getLogger('datalad.metadata.extractors.runprov')
class RunProvenanceExtractor(MetadataExtractor):
    """Report provenance metadata derived from DataLad `run` commit records.

    File reports link a content item to the recorded activity (commit)
    that generated it; the dataset report is a PROV-style JSON-LD graph
    of all discovered activities and their associated agents.
    """
    def __call__(self, dataset, refcommit, process_type, status):
        # shortcut
        ds = dataset
        # relative POSIX path -> info on the newest recorded activity
        # that produced the file at that path
        generated = {}
        # every discovered run activity, keyed by its commit SHA
        acts = {}
        for run in yield_run_records(ds):
            # records come newest-first, so the first diff entry seen for
            # a path already describes its most recent change
            for change in run.pop('diff', []):
                path = change['path']
                if path in generated:
                    # latest change for this path is already known
                    continue
                if change['mode'] == '000000':
                    # the file was deleted, so it cannot be part of the
                    # to-be-described set of files
                    continue
                # remember which activity generated this file, and the
                # file's gitshasum as produced by that activity, so a
                # later modification relative to the to-be-described
                # state can be detected
                generated[path] = dict(
                    activity=run['gitshasum'],
                    gitshasum=change['gitshasum'],
                )
            acts[run['gitshasum']] = run

        reported_content = False
        if process_type in ('all', 'content'):
            for rec in status:
                # check the activity change logs for knowledge about
                # this particular entry
                relpath = Path(rec['path']).relative_to(ds.pathobj).as_posix()
                known = generated.get(relpath, {})
                if known.get('gitshasum', None) != rec.get('gitshasum', ''):
                    # no recorded activity produced this exact file
                    # state; last-modified authorship is the core
                    # extractor's job, not this run-provenance one
                    continue
                # the file at this path was generated by a recorded
                # activity
                yield dict(
                    rec,
                    metadata={
                        '@id': get_file_id(rec),
                        "@type": "entity",
                        "prov:wasGeneratedBy": {
                            "@id": known['activity'],
                        },
                    },
                    type=rec['type'],
                    status='ok',
                )
                reported_content = True

        if process_type in ('all', 'dataset'):
            agents = {}
            graph = []
            for sha in sorted(acts):
                run = acts[sha]
                agent_id = get_agent_id(run['author_name'], run['author_email'])
                # collect agent docs for a unique list at the end rather
                # than reporting one per activity
                agents[agent_id] = dict(
                    name=run['author_name'],
                    email=run['author_email'],
                )
                graph.append({
                    '@id': sha,
                    '@type': 'activity',
                    'atTime': run['commit_date'],
                    'prov:wasAssociatedWith': {
                        '@id': agent_id,
                    },
                    # TODO extend message with formatted run record
                    # targeted for human consumption (but consider
                    # possible leakage of information from sidecar
                    # runrecords)
                    'rdfs:comment': run['message'],
                })
            # now documents on the committers; likely duplicating a
            # (configurable) report by the datalad_core extractor, but
            # reports are kept self-contained per extractor -- the
            # redundancy will be eaten by XZ compression
            for agent_id in sorted(agents):
                info = agents[agent_id]
                graph.append({
                    '@id': agent_id,
                    '@type': 'agent',
                    'name': info['name'],
                    'email': info['email'],
                })
            if reported_content or graph:
                # either a context report for file records is required,
                # or there is something to say about this dataset; in
                # general one will not come without the other
                yield dict(
                    metadata={
                        '@context': 'http://openprovenance.org/prov.jsonld',
                        '@graph': graph,
                    },
                    type='dataset',
                    status='ok',
                )
def yield_run_records(ds):
    """Yield run-record dicts parsed from the dataset's git history, newest first.

    Runs a single ``git log`` restricted (via ``--grep``) to commits that
    contain the run-record sentinel line, using a custom pretty format with
    NUL-based markers so the stream can be split back into per-commit
    records and their ``--raw`` diffs.

    Each yielded dict has keys: ``gitshasum``, ``author_name``,
    ``author_email``, ``commit_date``, ``message``, ``run_record`` (the
    decoded JSON run record), and ``diff`` (a list of dicts with
    ``prev_mode``, ``mode``, ``prev_gitshasum``, ``gitshasum``, ``path``).
    """
    def _finalize_record(r):
        # split the accumulated commit body into the human message and the
        # embedded (serialized) run record
        msg, rec = _split_record_message(r.pop('body', []))
        r['message'] = msg
        # TODO this can also just be a runrecord ID in which case we need
        # to load the file and report its content
        rec = jsonloads(rec)
        if not isinstance(rec, dict):
            # this is a runinfo file name: the record proper lives in a
            # sidecar file under .datalad/runinfo
            rec = jsonload(
                str(ds.pathobj / '.datalad' / 'runinfo' / rec),
                # TODO this should not be necessary, instead jsonload()
                # should be left on auto, and `run` should save compressed
                # files with an appropriate extension
                compressed=True,
            )
        r['run_record'] = rec
        return r

    # the record currently being assembled (None until the first header
    # line after a record marker has been seen)
    record = None
    # True while consuming --raw diff lines of the current record
    indiff = False
    for line in ds.repo.call_git_items_(
            # -F: treat the --grep pattern as a fixed string, not a regex
            ['log', '-F',
             '--grep', '=== Do not change lines below ===',
             # NUL-delimited markers: '\0\0record\0' opens each commit,
             # then one header line (SHA, author name/email, ISO date),
             # then the full body (%B), then '\0\0diff\0' before the raw diff
             "--pretty=tformat:%x00%x00record%x00%n%H%x00%aN%x00%aE%x00%aI%n%B%x00%x00diff%x00",
             "--raw", "--no-abbrev"]):
        if line == '\0\0record\0':
            indiff = False
            # fresh record: flush the previous one, if any
            if record:
                yield _finalize_record(record)
            record = None
        elif record is None:
            # first line after the record marker is the metadata header
            record = dict(zip(
                ('gitshasum', 'author_name', 'author_email', 'commit_date'),
                line.split('\0')
            ))
            record['body'] = []
            record['diff'] = []
        elif line == '\0\0diff\0':
            indiff = True
        elif indiff:
            if not line.startswith(':'):
                # not a raw diff entry (e.g. an empty separator line)
                continue
            # raw diff format: ':<old mode> <new mode> <old sha> <new sha> <status>\t<path>'
            diff = line[1:].split(' ')[:4]
            # path follows the first TAB; taken verbatim to survive paths
            # containing spaces
            diff.append(line[line.index('\t') + 1:])
            record['diff'].append(
                dict(zip(
                    ('prev_mode', 'mode', 'prev_gitshasum', 'gitshasum',
                     'path'),
                    diff
                ))
            )
        else:
            # anything else belongs to the commit message body
            record['body'].append(line)
    if record:
        # flush the final (oldest) record
        yield _finalize_record(record)
def _split_record_message(lines):
msg = []
run = []
inrec = False
for line in lines:
if line == "=== Do not change lines below ===":
inrec = True
elif line == "^^^ Do not change lines above ^^^":
inrec = False
elif inrec:
run.append(line)
else:
msg.append(line)
return '\n'.join(msg).strip(), ''.join(run)
# TODO report runrecord directory as content-needed, if configuration wants this
# information to be reported. However, such files might be used to prevent leakage
# of sensitive information....