sc2_iff_parse.py
#!/usr/bin/env python
import collections
import itertools
from utils import open_file, parse_uint8, write_file_contents, get_padded_bytes, parse_int32, parse_uint32
SC2_SIZE_DICT = collections.OrderedDict((('CNAM', 32), ('MISC', 4800), ('ALTM', 32768), ('XTER', 16384), ('XBLD', 16384),
                                         ('XZON', 16384), ('XUND', 16384), ('XTXT', 16384), ('XLAB', 6400), ('XMIC', 1200),
                                         ('XTHG', 480), ('XBIT', 16384), ('XTRF', 4096), ('XPLT', 4096), ('XVAL', 4096), ('XCRM', 4096),
                                         ('XPLC', 1024), ('XFIR', 1024), ('XPOP', 1024), ('XROG', 1024), ('XGRP', 3328), ('TEXT', 0), ('SCEN', 0), ('PICT', 0),))
SCENARIO_CHUNKS = ('TEXT', 'SCEN', 'PICT')
SC2 = True
# Exception classes so that raised exceptions can be more descriptive.
class IFFParse(Exception):
    """
    Base class for exceptions.
    """
    pass


class SC2Parse(IFFParse):
    """
    Exceptions for when parsing a .sc2 file fails.
    """
    def __init__(self, message):
        self.message = message


class MIFFParse(IFFParse):
    """
    Exceptions for when parsing a MIFF file fails.
    """
    def __init__(self, message):
        self.message = message

# Functions dealing with opening and parsing the basic contents of the IFF files.
def check_file(input_data, input_type):
    """
    Does some basic checks of the file to make sure it's valid and includes special handling for the Mac version of the game. The IFF standard is from 1985, so it's not super robust...
    Untested with some of the weirder versions of SC2k, such as Amiga, PocketPC/Windows Mobile, etc.
    Currently only supports parsing for FORM and MIFF files.
    Args:
        input_data (bytes): bytes containing the entirety of the city.
        input_type (str): type of input file, supported are 'mif' for .mif tileset/MIFF file and 'sc2' for .sc2 city file.
    Returns:
        A tuple containing a dictionary and the input.
        The dictionary looks like {'type_id': header, 'data_size': reported_size, 'file_type': file_type, 'city_name': city_name} where header is the opening 4 bytes of the input as a bytestring, reported_size is an int of the size the file claims to be, file_type is one of b"SC2K" (tileset) or b"SCDH" (city), and city_name is the name pulled from a Mac file (None otherwise).
    Raises:
        SC2Parse: an error relating to parsing .sc2 files. Could be caused by the file being a SimCity Classic city (currently an unsupported format), not being a city file at all, or being corrupted.
        MIFFParse: an error relating to parsing .mif files. Could be caused by file corruption or by the file not actually being a tileset file.
    """
    # Check and convert if this is a Mac city file.
    city_name = None
    if mac_check(input_data):
        input_data, city_name = mac_fix(input_data)
    # This should be "FORM" for .sc2
    header = input_data[0 : 4]
    # The reported size saved in the .sc2, we don't count the first 8 bytes though, so we need to add them back.
    reported_size = parse_int32(input_data[4 : 8]) + 8
    # This should be "SCDH"
    file_type = input_data[8 : 12]
    # Actual size of our input file
    actual_size = len(input_data)
    if input_type == 'sc2':
        if header != b"FORM":
            # Check and see if this is a SimCity Classic city.
            if input_data[0x41 : 0x49] == b'\x43\x49\x54\x59\x4D\x43\x52\x50' and header[0 : 2] == b'\x00\x0d':
                error_message = "SimCity Classic city files are not supported."
            else:
                error_message = f"Not a FORM type IFF file, claiming: {header}"
            raise SC2Parse(error_message)
        if reported_size != actual_size:
            error_message = f"File reports being: {reported_size}B, but is actually {actual_size}B long."
            raise SC2Parse(error_message)
        if file_type != b"SCDH":
            error_message = f"File type is not SCDH, claiming: {file_type}"
            raise SC2Parse(error_message)
    elif input_type == 'mif':
        if header != b"MIFF":
            error_message = f"Not a MIFF type IFF file, claiming: {header}"
            raise MIFFParse(error_message)
        if reported_size != actual_size:
            error_message = f"File reports being: {reported_size}B, but is actually {actual_size}B long."
            raise MIFFParse(error_message)
        if file_type != b"SC2K":
            error_message = f"File type is not SC2K, claiming: {file_type}"
            raise MIFFParse(error_message)
    return {'type_id': header, 'data_size': reported_size, 'file_type': file_type, "city_name": city_name}, input_data

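# A minimal usage sketch (hypothetical file name; everything else comes from this module):
#   with open("my_city.sc2", "rb") as f:
#       raw = f.read()
#   header, raw = check_file(raw, 'sc2')
#   # header -> {'type_id': b'FORM', 'data_size': <file size>, 'file_type': b'SCDH', 'city_name': None}
#   # (city_name is only populated for Mac files.)
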
def mac_check(input_data):
    """
    Checks if this is a Mac .sc2 file.
    Args:
        input_data (bytes): raw city information.
    Returns:
        True if this is a Mac formatted file, False if it isn't.
    """
    header = input_data[0 : 4]
    mac_form = input_data[0x80 : 0x84]
    if header != b"FORM" and mac_form == b"FORM":
        return True
    else:
        return False

def mac_fix(input_data):
    """
    Makes a Mac city file compatible with the Win95 version of the game.
    Basically, we don't need the first 0x80 bytes from the Mac file, something about a resource fork. Also, some of the files have garbage data at the end, which is also trimmed.
    Args:
        input_data (bytes): raw city information.
    Returns:
        Bytes comprising a compatible SC2k Win95 city file from the Mac file, and the name of the city from the start of the file.
    """
    reported_size = parse_int32(input_data[0x84 : 0x88]) + 8
    name_len = input_data[1]
    city_name = input_data[1 : 2 + name_len]
    return input_data[0x80 : 0x80 + reported_size], city_name

# Functions to handle chunking up the IFF file.
def get_chunk_from_offset(input_data, offset):
    """
    Parses an IFF chunk by reading the header and using the size to determine which bytes belong to it.
    An IFF chunk has an 8 byte header, of which the first 4 bytes are the type and the second 4 bytes are the size (exclusive of the header).
    Args:
        input_data (bytes): raw city information.
        offset (int): starting offset in the input to start parsing at.
    Returns:
        A list containing the id of the chunk (a 4 byte ascii value), an int length of the chunk and finally the bytes of the chunk data.
    """
    location_index = offset
    chunk_id = input_data[location_index : location_index + 4].decode('ascii')
    # Maximum 32b/4B, so 2^32 in length.
    chunk_size = parse_uint32(input_data[location_index + 4 : location_index + 8])
    chunk_data = input_data[location_index + 8 : location_index + 8 + chunk_size]
    return [chunk_id, chunk_size, chunk_data]

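# Usage sketch: the first chunk of a .sc2 city starts at offset 12, right after
# the 12 byte FORM/size/SCDH file header that check_file() validates:
#   chunk_id, chunk_size, chunk_data = get_chunk_from_offset(raw, 12)
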
def get_chunk_from_name(input_data, section_name):
    """
    Gets the specified chunk based on its name and parses it.
    Warning!: This could be fragile if there's a sign with "XZON" in it, or similar.
    Args:
        input_data (bytes): raw city information.
        section_name (str): ASCII convertible name of the IFF section/chunk id to get.
    Returns:
        A list containing the id of the chunk (a 4 byte ascii value), an int length of the chunk and finally the bytes of the chunk data.
    """
    return get_chunk_from_offset(input_data, input_data.index(bytes(section_name, 'ascii')))

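# Usage sketch: pull the altitude map chunk directly by its id (fragile if those
# id bytes happen to appear inside earlier chunk data, as warned above):
#   altm_id, altm_size, altm_data = get_chunk_from_name(raw, "ALTM")
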
def get_n_bytes(iterable, n, fill=None):
    """
    Splits the input iterable up every n bytes, and pads it if it's shorter.
    For example, if iterable is a list of 31 bytes and n is 8, there will be 4 resulting groups, with the last entry in the last one being whatever value fill has.
    Args:
        iterable (bytes): raw city information.
        n (int): number of bytes to split on.
        fill (bytes): if iterable isn't cleanly divided by n, pad the last group out with this value.
    Returns:
        An iterator splitting the input up every n bytes, padded out to always be a multiple of n bytes long.
    """
    args = [iter(iterable)] * n
    output = itertools.zip_longest(*args, fillvalue=fill)
    return output

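# Example: splitting 5 bytes into groups of 2 pads the final group with the fill value.
#   >>> list(get_n_bytes(b'\x01\x02\x03\x04\x05', 2, fill=0))
#   [(1, 2), (3, 4), (5, 0)]
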
# Functions to handle compression and decompression of the IFF file.
def uncompress_rle(encoded_data):
    """
    Uncompresses the RLE compressed city data. For more information, consult the .sc2 file format specification documents at https://github.com/dfloer/SC2k-docs
    Args:
        encoded_data (bytes): raw city information.
    Returns:
        Uncompressed bytes.
    """
    decoded_data = bytearray()
    next_byte_repeat = False
    byte_count = 0
    # Data is stored in two forms: 0x01..0x7F and 0x81..0xFF
    for byte in encoded_data:
        if byte < 0x80 and byte_count == 0:
            # In this case, byte is a count of the number of data bytes that follow.
            byte_count = byte
            next_byte_repeat = False
        elif byte > 0x80 and byte_count == 0:
            # In this case, byte - 0x7F is the count of how many times the very next byte repeats.
            byte_count = byte - 0x7f
            next_byte_repeat = True
        else:
            if byte_count > 0 and next_byte_repeat:
                decoded_data.extend([byte] * byte_count)
                byte_count = 0
            elif byte_count > 0 and not next_byte_repeat:
                decoded_data.extend([byte])
                byte_count -= 1
    return decoded_data

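# Decoding sketch for the two run types: a literal run (count 0x01..0x7F followed
# by that many data bytes) and a repeat run (count 0x81..0xFF followed by the
# single byte to repeat).
#   >>> bytes(uncompress_rle(b'\x03\x01\x02\x03'))
#   b'\x01\x02\x03'
#   >>> bytes(uncompress_rle(b'\x83\x00'))
#   b'\x00\x00\x00\x00'
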
def compress_rle(uncompressed_data):
    """
    Compresses city data with a generally comparable algorithm as SC2k uses stock. See .sc2 file spec for full details at https://github.com/dfloer/SC2k-docs
    Args:
        uncompressed_data (bytes): Uncompressed data.
    Returns:
        Compressed bytes.
    """
    # Count the bytes we have, and create a tuple (to make sure ordering is preserved) of a count and then the byte value that repeats.
    counted_bytes = [(len(list(group)), value) for value, group in itertools.groupby(uncompressed_data)]
    # Break runs up into the maximum number of consecutive bytes allowed (128).
    counted_bytes2 = []
    for c, b in counted_bytes:
        full_runs = c // 128
        leftover = c % 128
        counted_bytes2 += [(128, b)] * full_runs
        if leftover:
            counted_bytes2 += [(leftover, b)]
    compressed_data = bytearray()
    temp = bytearray()
    offset = 0
    for count, byte in counted_bytes2:
        data = uncompressed_data[offset : offset + count]
        if count == 1:
            # A literal run can't encode more than 127 bytes, so if we hit that limit, flush it and start another run.
            if len(temp) == 127:
                compressed_data.extend([len(temp)])
                compressed_data.extend(temp)
                temp = bytearray()
            temp.extend(data)
        else:
            chunks = [data[x : x + 0x80] for x in range(0, len(data), 0x80)]
            # A repeat run ends any pending literal run, so flush that first.
            if len(temp) != 0:
                compressed_data.extend([len(temp)])
                compressed_data.extend(temp)
                temp = bytearray()
            for chunk in chunks:
                compressed_data.extend([len(chunk) + 0x7f, byte])
        offset += count
    # Flush any literal bytes left over at the end.
    if len(temp) != 0:
        compressed_data.extend([len(temp)])
        compressed_data.extend(temp)
    return compressed_data

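# Round-trip sketch: repeated bytes become a repeat run, isolated bytes accumulate
# into a literal run, and decoding recovers the original input.
#   >>> bytes(compress_rle(b'\x01\x02\x02\x02\x03'))
#   b'\x01\x01\x82\x02\x01\x03'
#   >>> bytes(uncompress_rle(compress_rle(b'\x01\x02\x02\x02\x03')))
#   b'\x01\x02\x02\x02\x03'
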
def uncompress_rle_hex(encoded_data):
    """
    Convenience function that generates a hex representation of the data. Useful for working around print() mangling decoded binary data.
    Args:
        encoded_data (bytes): binary data.
    Returns:
        List of strings of the hexadecimal representation of the input data.
    """
    return [hex(x) for x in encoded_data]

# Functions to handle decompression and compression of the actual city information.
def chunk_input_serial(input_file, input_type='sc2'):
    """
    Takes raw city data (as stored on disk) and splits it into chunks.
    Args:
        input_file (bytes): raw city data.
        input_type (str): type of the input file we're opening.
    Returns:
        A dictionary of {chunk id: chunk data} form, one entry per chunk.
    Raises:
        SC2Parse: re-raised errors from check_file()
    """
    output_dict = collections.OrderedDict()
    try:
        header, input_file = check_file(input_file, input_type)
    except SC2Parse:
        raise
    file_length = header['data_size']
    # -12B for the header
    remaining_length = file_length - 12
    if "CNAM" not in output_dict and header["city_name"] is not None:
        output_dict["CNAM"] = header["city_name"]
    while remaining_length > 0:
        offset = file_length - remaining_length
        chunk = get_chunk_from_offset(input_file, offset)
        chunk_id = chunk[0]
        chunk_data = chunk[2]
        if chunk_id == "TEXT":
            try:
                output_dict[chunk_id] += [chunk_data]
            except KeyError:
                output_dict[chunk_id] = [chunk_data]
        else:
            output_dict[chunk_id] = chunk_data
        # How much of the file still needs to be scanned? Subtract the size of the chunk's data and header from it.
        remaining_length -= (chunk[1] + 8)
    return output_dict

def sc2_uncompress_input(input_file, input_type='sc2'):
    """
    Uncompresses a compressed .mif or .sc2 file.
    For a .sc2 file, doesn't uncompress chunks with an id of CNAM or ALTM, and for a .mif, doesn't uncompress TILE chunks.
    Args:
        input_file (dict): dictionary of {chunk id: chunk data}, as produced by chunk_input_serial().
        input_type (str): type of the input file we're opening.
    Returns:
        A dictionary of uncompressed {chunk id: chunk data} form, one entry per chunk.
    """
    uncompressed_dict = collections.OrderedDict()
    for k, v in input_file.items():
        if input_type == 'sc2':
            if k not in ("CNAM", "ALTM", "TEXT", "SCEN", "PICT"):
                uncompressed_dict[k] = uncompress_rle(v)
            elif k == "TEXT":
                uncompressed_dict[k] = [bytearray(x) for x in v]
            else:
                uncompressed_dict[k] = bytearray(v)
        elif input_type == 'mif':
            if k != "TILE":
                uncompressed_dict[k] = uncompress_rle(v)
            else:
                uncompressed_dict[k] = bytearray(v)
    return uncompressed_dict

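# Usage sketch (hypothetical .sc2 path): split a raw city file into its chunks,
# then decompress the compressed ones.
#   with open("my_city.sc2", "rb") as f:
#       raw_chunks = chunk_input_serial(f.read(), 'sc2')
#   city_data = sc2_uncompress_input(raw_chunks, 'sc2')
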
def mif_parse_tile(tile_data):
    """
    Splits a .mif file into a list of tiles for further parsing.
    Args:
        tile_data (bytes): compressed city data.
    Returns:
        A list of tiles.
    """
    output = []
    file_length = len(tile_data)
    remaining_length = file_length
    while remaining_length > 0:
        offset = file_length - remaining_length
        chunk = get_chunk_from_offset(tile_data, offset)
        output.append([chunk[0], chunk[2]])
        # How much of the file still needs to be scanned? Subtract the size of the chunk's data and header from it.
        remaining_length -= (chunk[1] + 8)
    return output

def clean_city_name(dirty_name):
    """
    Gets the city's name, if it exists. Sometimes CNAM contains garbage, so this also cleans that up.
    Args:
        dirty_name (bytes): City's name, possibly with garbage in it.
    Returns:
        A string of the name, with garbage removed.
    """
    clean_name = ""
    dirty_name = dirty_name[1 : 32]
    for x in dirty_name:
        if x == 0x00:
            break
        clean_name += chr(x)
    return str(clean_name)

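# Example (made-up name): a CNAM chunk stores a length byte followed by the name
# and then padding/garbage; the length byte and everything after the first NUL
# are dropped.
#   >>> clean_city_name(b'\x07Anytown\x00\xff\xff')
#   'Anytown'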