From 80ac0a52ee77207e00473e09589a422ab03b26d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20St=C3=B6r?= Date: Sun, 17 Mar 2019 23:43:05 +0100 Subject: [PATCH] Drop non-recurring events immediately if out-of-range Fixes #207 --- README.md | 40 ++++++++++++++++++++- src/ICal/ICal.php | 90 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 114 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 0dcba97..45fddea 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ - PHP 5 (≥ 5.3.0) - [Valid ICS](https://icalendar.org/validator.html) (`.ics`, `.ical`, `.ifb`) file - [IANA](https://www.iana.org/time-zones), [Unicode CLDR](http://cldr.unicode.org/translation/timezones) or [Windows](https://support.microsoft.com/en-ca/help/973627/microsoft-time-zone-index-values) Time Zones - - Windows will need the `$replaceWindowsTimeZoneIds = true` configuration + - Windows Time Zones will need the `$replaceWindowsTimeZoneIds = true` configuration ### Setup @@ -66,6 +66,44 @@ --- +## On parsing iCal +Parsing [iCal/iCalendar/ICS](https://en.wikipedia.org/wiki/ICalendar) resources poses several challenges. One is that +the specification is a moving target; the original RFC was updated four times in ten years. The other is that vendors +were both liberal (read: creative) in interpreting the specification and productive implementing proprietary extensions. + +However, what impedes efficient parsing most directly are recurrence rules for events. This library parses the original +calendar into an easy to work with memory model. This requires that each recurring event is expanded or exploded. Hence, +a single event that occurs daily will generate a new event instance for every day when this parser processes the +calendar ([`$defaultSpan`](#variables) limits this). To get an idea how this is done take a look at the +[call graph](https://user-images.githubusercontent.com/624195/45904641-f3cd0a80-bded-11e8-925f-7bcee04b8575.png). 
+ +As a consequence the _entire_ calendar is parsed line-by-line, and thus loaded into memory, first. As you can imagine +large calendars tend to get huge when exploded i.e. with all their recurrence rules evaluated. This is exacerbated when +old calendars do not remove past events as they get fatter and fatter every year. + +This limitation is particularly painful if you only need a window into the original calendar. It seems wasteful to parse +the entire fully exploded calendar into memory if you later are going to call the +[`eventsFromInterval()` or `eventsFromRange()`](#methods) on it. + +In late 2018 [#190](https://github.com/u01jmg3/ics-parser/pull/190) added the option to drop all events outside a given +range very early in the parse process at the cost of some precision (timezone calculations not done at that point). This +massively reduces the total time for parsing a calendar. Same goes for memory consumption of course. Precondition is that +you know upfront that you don't care about events outside a given range. + +Let's say you are only interested in events from yesterday, today and tomorrow. To compensate for the fact that the +tricky timezone transformations and calculations have not been executed yet by the time the parser has to decide whether +to keep or drop an event you tell it to filter for **+-2d** instead of +-1d. Once it is done you would then call +`eventsFromRange()` with +-1d to get precisely the events in the window you are interested in. That is what the variables +[`$filterDaysBefore` and `$filterDaysAfter`](#variables) are for. + +In Q1 2019 [#213](https://github.com/u01jmg3/ics-parser/pull/213) further improved the performance by immediately +dropping _non-recurring_ events once parsed if they are outside that fuzzy window. This greatly reduces the maximum +memory consumption for large calendars. 
PHP by default does not allocate more than 128MB heap and would otherwise crash +with `Fatal error: Allowed memory size of 134217728 bytes exhausted`. It goes without saying that recurring events first +need to be evaluated before non-fitting events can be dropped. + +--- + ## API ### `ICal` API diff --git a/src/ICal/ICal.php b/src/ICal/ICal.php index e83a4f6..86caeba 100644 --- a/src/ICal/ICal.php +++ b/src/ICal/ICal.php @@ -410,6 +410,22 @@ class ICal */ private $windowsTimeZonesIana; + /** + * If $filterDaysBefore or $filterDaysAfter are set then the events are filtered according to the window defined + * by the below two fields. + * + * @var int + */ + private $windowMinTimestamp; + private $windowMaxTimestamp; + + /** + * True if either $filterDaysBefore or $filterDaysAfter are set. + * + * @var boolean + */ + private $shouldFilterByWindow; + /** * Creates the ICal object * @@ -435,6 +451,14 @@ public function __construct($files = false, array $options = array()) $this->windowsTimeZones = array_keys(self::$windowsTimeZonesMap); $this->windowsTimeZonesIana = array_values(self::$windowsTimeZonesMap); + // Ideally you would use `PHP_INT_MIN` from PHP 7 + $php_int_min = -2147483648; + + $this->windowMinTimestamp = is_null($this->filterDaysBefore) ? $php_int_min : (new \DateTime('now'))->sub(new \DateInterval('P' . $this->filterDaysBefore . 'D'))->getTimestamp(); + $this->windowMaxTimestamp = is_null($this->filterDaysAfter) ? PHP_INT_MAX : (new \DateTime('now'))->add(new \DateInterval('P' . $this->filterDaysAfter . 'D'))->getTimestamp(); + + $this->shouldFilterByWindow = !is_null($this->filterDaysBefore) || !is_null($this->filterDaysAfter); + if ($files !== false) { $files = is_array($files) ? 
$files : array($files); @@ -604,13 +628,19 @@ protected function initLines(array $lines) case 'END:DAYLIGHT': case 'END:STANDARD': case 'END:VCALENDAR': - case 'END:VEVENT': case 'END:VFREEBUSY': case 'END:VTIMEZONE': case 'END:VTODO': $component = 'VCALENDAR'; break; + case 'END:VEVENT': + if ($this->shouldFilterByWindow) { + $this->removeLastEventIfOutsideWindowAndNonRecurring(); + } + $component = 'VCALENDAR'; + break; + default: $this->addCalendarComponentWithKeyAndValue($component, $keyword, $value); break; @@ -639,7 +669,7 @@ protected function initLines(array $lines) } } - if (!is_null($this->filterDaysBefore) || !is_null($this->filterDaysAfter)) { + if ($this->shouldFilterByWindow) { $this->reduceEventsToMinMaxRange(); } @@ -647,6 +677,30 @@ protected function initLines(array $lines) } } + /** + * Removes the last event (i.e. most recently parsed) if it is non-recurring (no or empty RRULE) and its start date + * is outside the window spanned by windowMinTimestamp/windowMaxTimestamp. + * + * @throws \Exception + */ + private function removeLastEventIfOutsideWindowAndNonRecurring() + { + $events = (isset($this->cal['VEVENT'])) ? $this->cal['VEVENT'] : array(); + + if (!empty($events)) { + $lastIndex = count($events) - 1; + $lastEvent = $events[$lastIndex]; + + // `&&` binds tighter than `||`: group the RRULE checks so only non-recurring events outside the window are dropped + if ((!isset($lastEvent['RRULE']) || $lastEvent['RRULE'] === '') && $this->isEventStartOutsideWindow($lastEvent)) { + $this->eventCount--; + unset($events[$lastIndex]); + } + + $this->cal['VEVENT'] = $events; + } + } + /** * Reduces the number of events to the defined minimum and maximum range * @@ -657,16 +711,9 @@ protected function reduceEventsToMinMaxRange() $events = (isset($this->cal['VEVENT'])) ? $this->cal['VEVENT'] : array(); if (!empty($events)) { - // Ideally you would use `PHP_INT_MIN` from PHP 7 - $php_int_min = -2147483648; - - $minTimestamp = is_null($this->filterDaysBefore) ? $php_int_min : (new \DateTime('now'))->sub(new \DateInterval('P' . $this->filterDaysBefore . 'D'))->getTimestamp(); - $maxTimestamp = is_null($this->filterDaysAfter) ? 
PHP_INT_MAX : (new \DateTime('now'))->add(new \DateInterval('P' . $this->filterDaysAfter . 'D'))->getTimestamp(); - foreach ($events as $key => $anEvent) { - if (!$this->isValidDate($anEvent['DTSTART']) || $this->isOutOfRange($anEvent['DTSTART'], $minTimestamp, $maxTimestamp)) { + if ($this->isEventStartOutsideWindow($anEvent)) { $this->eventCount--; - unset($events[$key]); continue; @@ -678,18 +725,31 @@ protected function reduceEventsToMinMaxRange() } /** - * Determines whether an event's start time is within a given range + * Determines whether the event start date is outside the windowMinTimestamp/windowMaxTimestamp. Returns true for + * invalid dates. + * + * @param array $event + * @return boolean + * @throws \Exception + */ + private function isEventStartOutsideWindow($event) + { + return !$this->isValidDate($event['DTSTART']) || $this->isOutOfRange($event['DTSTART'], $this->windowMinTimestamp, $this->windowMaxTimestamp); + } + + /** + * Determines whether a valid iCalendar date is outside a given range (only the date part before `T` is compared) * - * @param string $eventStart + * @param string $calendarDate * @param integer $minTimestamp * @param integer $maxTimestamp * @return boolean */ - protected function isOutOfRange($eventStart, $minTimestamp, $maxTimestamp) + protected function isOutOfRange($calendarDate, $minTimestamp, $maxTimestamp) { - $eventStartTimestamp = strtotime(explode('T', $eventStart)[0]); + $timestamp = strtotime(explode('T', $calendarDate)[0]); - return $eventStartTimestamp < $minTimestamp || $eventStartTimestamp > $maxTimestamp; + return $timestamp < $minTimestamp || $timestamp > $maxTimestamp; } /**