Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support parsing tz timezone data #10456

Merged
merged 5 commits into from
Jan 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ pub fn build(b: *Builder) !void {
".z.9",
".gz",
"rfc1951.txt",
".tzif",
},
.blank_extensions = &[_][]const u8{
"test.zig",
Expand Down
1 change: 1 addition & 0 deletions lib/std/std.zig
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ pub const StringArrayHashMapUnmanaged = array_hash_map.StringArrayHashMapUnmanag
pub const TailQueue = @import("linked_list.zig").TailQueue;
pub const Target = @import("target.zig").Target;
pub const Thread = @import("Thread.zig");
pub const Tz = @import("tz.zig").Tz;

pub const array_hash_map = @import("array_hash_map.zig");
pub const atomic = @import("atomic.zig");
Expand Down
252 changes: 252 additions & 0 deletions lib/std/tz.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
const std = @import("std.zig");
const builtin = @import("builtin");

/// One transition record: a timestamp paired with a pointer to one of the parsed time types.
pub const Transition = struct {
/// Transition time as read from the file (read as a big-endian i32 in the legacy block, i64 otherwise).
ts: i64,
/// Points at an element of the `timetypes` slice allocated by the parser, so it is only
/// usable while that slice has not been freed by `Tz.deinit`.
timetype: *Timetype,
};

pub const Timetype = struct {
offset: i32,
flags: u8,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why have this as flags instead of bool fields for each?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes Timetype pack better; 12 bytes instead of 16. I can change it if the optimization isn't worth the complexity of having getters for each value.

name_data: [6:0]u8,

/// Returns the designation stored in `name_data`, up to (not including) the first 0 byte.
///
/// `self` is taken by pointer so that the returned slice refers to the caller's
/// `Timetype` instead of a by-value parameter copy; the slice is valid for as long as
/// that `Timetype` is. Method-call syntax (`tt.name()`) works unchanged.
pub fn name(self: *const Timetype) [:0]const u8 {
    return std.mem.sliceTo(self.name_data[0..], 0);
}

/// Reports whether bit 0x01 of `flags` is set (the parser seeds `flags` with the
/// per-type dst byte, which it requires to be 0 or 1).
pub fn isDst(self: Timetype) bool {
    const dst_mask: u8 = 0x01;
    return (self.flags & dst_mask) != 0;
}

/// Reports whether bit 0x02 of `flags` is set (the parser sets it when the
/// standard/wall indicator byte for this type is 1).
pub fn standardTimeIndicator(self: Timetype) bool {
    const std_mask: u8 = 0x02;
    return (self.flags & std_mask) != 0;
}

/// Reports whether bit 0x04 of `flags` is set (the parser sets it when the
/// UT/local indicator byte for this type is 1).
pub fn utIndicator(self: Timetype) bool {
    const ut_mask: u8 = 0x04;
    return (self.flags & ut_mask) != 0;
}
};

/// One leap-second record from the parsed data.
pub const Leapsecond = struct {
/// Occurrence value read from the file; the parser rejects negative values and values
/// that do not fit in an i48.
occurrence: i48,
/// Correction value read from the file as a 32-bit integer and narrowed to i16 by the parser.
correction: i16,
};

pub const Tz = struct {
allocator: std.mem.Allocator,
transitions: []const Transition,
timetypes: []const Timetype,
leapseconds: []const Leapsecond,
footer: ?[]const u8,

/// Header layout as read directly from the stream with `reader.readStruct`.
/// Declared `extern` so the field order matches the bytes in the file.
const Header = extern struct {
/// Must equal "TZif"; anything else is rejected with error.BadHeader.
magic: [4]u8,
/// Accepted values are 0, '2' and '3'; 0 is parsed as the legacy format.
version: u8,
/// Not inspected by the parser.
reserved: [15]u8,
/// Record counts. `parse` byte-swaps these on non-big-endian targets before using them.
counts: extern struct {
/// Number of UT/local indicator bytes; must be 0 or equal to `typecnt`.
isutcnt: u32,
/// Number of standard/wall indicator bytes; must be 0 or equal to `typecnt`.
isstdcnt: u32,
/// Number of leap-second records.
leapcnt: u32,
/// Number of transition times (and of transition type indices).
timecnt: u32,
/// Number of time type records; must not be 0.
typecnt: u32,
/// Number of bytes of designator data; must not be 0 and is capped at 256 + 6 by the parser.
charcnt: u32,
},
};

/// Parses TZif data from `reader` into a `Tz` whose memory is allocated with `allocator`.
///
/// A version 0 file is parsed from the data block that follows the first header.
/// For versions '2' and '3' that first block is skipped, a second header is read and
/// validated, and the block after it is parsed instead.
///
/// Returns `error.BadHeader` when a header does not start with "TZif",
/// `error.BadVersion` for an unsupported version byte, and otherwise whatever the
/// reader, the allocator or `parseBlock` report. Release the result with `deinit`.
pub fn parse(allocator: std.mem.Allocator, reader: anytype) !Tz {
    var legacy_header = try reader.readStruct(Header);
    if (!std.mem.eql(u8, &legacy_header.magic, "TZif")) return error.BadHeader;
    if (legacy_header.version != 0 and legacy_header.version != '2' and legacy_header.version != '3') return error.BadVersion;

    // The counts are stored big-endian; swap them on every other host.
    if (builtin.target.cpu.arch.endian() != std.builtin.Endian.Big) {
        std.mem.bswapAllFields(@TypeOf(legacy_header.counts), &legacy_header.counts);
    }

    if (legacy_header.version == 0) {
        return parseBlock(allocator, reader, legacy_header, true);
    }

    // If the format is modern, just skip over the legacy data.
    // The counts come straight from the file, so the size is computed in u64:
    // in u32 a crafted header could overflow the multiplication/addition.
    const counts = legacy_header.counts;
    const skipv = @as(u64, counts.timecnt) * 5 +
        @as(u64, counts.typecnt) * 6 +
        @as(u64, counts.charcnt) +
        @as(u64, counts.leapcnt) * 8 +
        @as(u64, counts.isstdcnt) +
        @as(u64, counts.isutcnt);
    try reader.skipBytes(skipv, .{});

    var header = try reader.readStruct(Header);
    if (!std.mem.eql(u8, &header.magic, "TZif")) return error.BadHeader;
    if (header.version != '2' and header.version != '3') return error.BadVersion;
    if (builtin.target.cpu.arch.endian() != std.builtin.Endian.Big) {
        std.mem.bswapAllFields(@TypeOf(header.counts), &header.counts);
    }

    return parseBlock(allocator, reader, header, false);
}

/// Parses the data block that follows `header` from `reader` and builds a `Tz`.
///
/// `legacy` selects the legacy layout: transition times and leap-second occurrences
/// are read as 32-bit instead of 64-bit integers, and no footer is read.
///
/// The returned `Tz` owns its three slices and the footer (if any); free them with
/// `deinit`. Returns `error.Malformed` when any of the validity checks below fails,
/// `error.OverlargeFooter` when the footer line does not fit in 128 bytes, and
/// otherwise propagates allocator and reader errors.
fn parseBlock(allocator: std.mem.Allocator, reader: anytype, header: Header, legacy: bool) !Tz {
    if (header.counts.isstdcnt != 0 and header.counts.isstdcnt != header.counts.typecnt) return error.Malformed; // rfc8536: isstdcnt [...] MUST either be zero or equal to "typecnt"
    if (header.counts.isutcnt != 0 and header.counts.isutcnt != header.counts.typecnt) return error.Malformed; // rfc8536: isutcnt [...] MUST either be zero or equal to "typecnt"
    if (header.counts.typecnt == 0) return error.Malformed; // rfc8536: typecnt [...] MUST NOT be zero
    if (header.counts.charcnt == 0) return error.Malformed; // rfc8536: charcnt [...] MUST NOT be zero
    if (header.counts.charcnt > 256 + 6) return error.Malformed; // Not explicitly banned by rfc8536 but nonsensical

    const leapseconds = try allocator.alloc(Leapsecond, header.counts.leapcnt);
    errdefer allocator.free(leapseconds);
    const transitions = try allocator.alloc(Transition, header.counts.timecnt);
    errdefer allocator.free(transitions);
    const timetypes = try allocator.alloc(Timetype, header.counts.typecnt);
    errdefer allocator.free(timetypes);

    // Parse transition times
    var i: usize = 0;
    while (i < header.counts.timecnt) : (i += 1) {
        transitions[i].ts = if (legacy) try reader.readIntBig(i32) else try reader.readIntBig(i64);
    }

    // Parse transition type indices
    i = 0;
    while (i < header.counts.timecnt) : (i += 1) {
        const tt = try reader.readByte();
        if (tt >= timetypes.len) return error.Malformed; // rfc8536: Each type index MUST be in the range [0, "typecnt" - 1]
        transitions[i].timetype = &timetypes[tt];
    }

    // Parse time types
    i = 0;
    while (i < header.counts.typecnt) : (i += 1) {
        const offset = try reader.readIntBig(i32);
        // An i32 can never be *less* than -2**31, so the forbidden value has to be tested with equality.
        if (offset == std.math.minInt(i32)) return error.Malformed; // rfc8536: utoff [...] MUST NOT be -2**31
        const dst = try reader.readByte();
        if (dst != 0 and dst != 1) return error.Malformed; // rfc8536: (is)dst [...] The value MUST be 0 or 1.
        const idx = try reader.readByte();
        if (idx >= header.counts.charcnt) return error.Malformed; // rfc8536: (desig)idx [...] Each index MUST be in the range [0, "charcnt" - 1]
        timetypes[i] = .{
            .offset = offset,
            .flags = dst,
            .name_data = undefined,
        };

        // Temporarily cache idx in name_data to be processed after we've read the designator names below
        timetypes[i].name_data[0] = idx;
    }

    var designators_data: [256 + 6]u8 = undefined;
    try reader.readNoEof(designators_data[0..header.counts.charcnt]);
    const designators = designators_data[0..header.counts.charcnt];
    if (designators[designators.len - 1] != 0) return error.Malformed; // rfc8536: charcnt [...] includes the trailing NUL (0x00) octet

    // Iterate through the timetypes again, setting the designator names
    for (timetypes) |*tt| {
        const name = std.mem.sliceTo(designators[tt.name_data[0]..], 0);
        // We are mandating the "SHOULD" 6-character limit so we can pack the struct better, and to conform to POSIX.
        if (name.len > 6) return error.Malformed; // rfc8536: Time zone designations SHOULD consist of at least three (3) and no more than six (6) ASCII characters.
        std.mem.copy(u8, tt.name_data[0..], name);
        tt.name_data[name.len] = 0;
    }

    // Parse leap seconds
    i = 0;
    while (i < header.counts.leapcnt) : (i += 1) {
        const occur: i64 = if (legacy) try reader.readIntBig(i32) else try reader.readIntBig(i64);
        if (occur < 0) return error.Malformed; // rfc8536: occur [...] MUST be nonnegative
        if (occur > std.math.maxInt(i48)) return error.Malformed; // Unreasonably far into the future
        // Compare via an i64 subtraction: both operands are nonnegative here, so it cannot
        // overflow, unlike adding 2419199 to the stored i48 value.
        if (i > 0 and occur - @as(i64, leapseconds[i - 1].occurrence) < 2419199) return error.Malformed; // rfc8536: occur [...] each later value MUST be at least 2419199 greater than the previous value

        const corr = try reader.readIntBig(i32);
        // Range-check both bounds before anything else so the arithmetic and the
        // narrowing cast below are always in range.
        if (corr < std.math.minInt(i16) or corr > std.math.maxInt(i16)) return error.Malformed; // Unreasonably large correction
        if (i == 0 and corr != -1 and corr != 1) return error.Malformed; // rfc8536: The correction value in the first leap-second record, if present, MUST be either one (1) or minus one (-1)
        if (i > 0) {
            const delta = corr - @as(i32, leapseconds[i - 1].correction);
            if (delta != 1 and delta != -1) return error.Malformed; // rfc8536: The correction values in adjacent leap-second records MUST differ by exactly one (1)
        }

        leapseconds[i] = .{
            .occurrence = @intCast(i48, occur),
            .correction = @intCast(i16, corr),
        };
    }

    // Parse standard/wall indicators
    i = 0;
    while (i < header.counts.isstdcnt) : (i += 1) {
        const stdtime = try reader.readByte();
        if (stdtime == 1) {
            timetypes[i].flags |= 0x02;
        }
    }

    // Parse UT/local indicators
    i = 0;
    while (i < header.counts.isutcnt) : (i += 1) {
        const ut = try reader.readByte();
        if (ut == 1) {
            timetypes[i].flags |= 0x04;
            if (!timetypes[i].standardTimeIndicator()) return error.Malformed; // rfc8536: standard/wall value MUST be one (1) if the UT/local value is one (1)
        }
    }

    // Footer
    var footer: ?[]u8 = null;
    if (!legacy) {
        if ((try reader.readByte()) != '\n') return error.Malformed; // An rfc8536 footer must start with a newline
        var footerdata_buf: [128]u8 = undefined;
        const footer_mem = reader.readUntilDelimiter(&footerdata_buf, '\n') catch |err| switch (err) {
            error.StreamTooLong => return error.OverlargeFooter, // Read more than 128 bytes, much larger than any reasonable POSIX TZ string
            else => return err,
        };
        // An empty footer line is represented as null rather than as an empty allocation.
        if (footer_mem.len != 0) {
            footer = try allocator.dupe(u8, footer_mem);
        }
    }
    errdefer if (footer) |ft| allocator.free(ft);

    return Tz{
        .allocator = allocator,
        .transitions = transitions,
        .timetypes = timetypes,
        .leapseconds = leapseconds,
        .footer = footer,
    };
}

/// Frees the footer (when present) and the leap-second, transition and time-type
/// slices using the allocator stored in `self`.
pub fn deinit(self: *Tz) void {
    const alloc = self.allocator;
    if (self.footer) |footer_bytes| alloc.free(footer_bytes);
    alloc.free(self.leapseconds);
    alloc.free(self.transitions);
    alloc.free(self.timetypes);
}
};

// Parses the embedded asia_tokyo.tzif fixture and spot-checks transitions and leap seconds.
test "slim" {
    const data = @embedFile("tz/asia_tokyo.tzif");
    var in_stream = std.io.fixedBufferStream(data);

    var tz = try std.Tz.parse(std.testing.allocator, in_stream.reader());
    defer tz.deinit();

    // expectEqual takes (expected, actual); the literals are typed so they match the field types.
    try std.testing.expectEqual(@as(usize, 9), tz.transitions.len);
    try std.testing.expectEqualStrings("JDT", tz.transitions[3].timetype.name());
    try std.testing.expectEqual(@as(i64, -620298000), tz.transitions[5].ts); // 1950-05-06 15:00:00 UTC
    try std.testing.expectEqual(@as(i48, 567993613), tz.leapseconds[13].occurrence); // 1988-01-01 00:00:00 UTC (+23s in TAI, and +13 in the data since it doesn't store the initial 10 second offset)
}

// Parses the embedded antarctica_davis.tzif fixture and spot-checks its transitions.
test "fat" {
    const data = @embedFile("tz/antarctica_davis.tzif");
    var in_stream = std.io.fixedBufferStream(data);

    var tz = try std.Tz.parse(std.testing.allocator, in_stream.reader());
    defer tz.deinit();

    // expectEqual takes (expected, actual); the literals are typed so they match the field types.
    try std.testing.expectEqual(@as(usize, 8), tz.transitions.len);
    try std.testing.expectEqualStrings("+05", tz.transitions[3].timetype.name());
    try std.testing.expectEqual(@as(i64, 1268251224), tz.transitions[4].ts); // 2010-03-10 20:00:00 UTC
}

// Parses the embedded europe_vatican.tzif fixture, which exercises the version 0 (legacy) path.
test "legacy" {
    // Taken from Slackware 8.0, from 2001
    const data = @embedFile("tz/europe_vatican.tzif");
    var in_stream = std.io.fixedBufferStream(data);

    var tz = try std.Tz.parse(std.testing.allocator, in_stream.reader());
    defer tz.deinit();

    // expectEqual takes (expected, actual); the literals are typed so they match the field types.
    try std.testing.expectEqual(@as(usize, 170), tz.transitions.len);
    try std.testing.expectEqualStrings("CET", tz.transitions[69].timetype.name());
    try std.testing.expectEqual(@as(i64, 1414285200), tz.transitions[123].ts); // 2014-10-26 01:00:00 UTC
}
Binary file added lib/std/tz/antarctica_davis.tzif
Binary file not shown.
Binary file added lib/std/tz/asia_tokyo.tzif
Binary file not shown.
Binary file added lib/std/tz/europe_vatican.tzif
Binary file not shown.