From e116e88e98bc737978d03894f524664d5aee070a Mon Sep 17 00:00:00 2001 From: Arik Fraimovich Date: Thu, 25 Dec 2014 15:39:49 +0200 Subject: [PATCH 1/2] Job to cleanup unused query results --- redash/controllers.py | 1 + redash/models.py | 9 +++++++++ redash/settings.py | 4 ++++ redash/tasks.py | 16 ++++++++++++++++ redash/worker.py | 29 ++++++++++++++++++----------- tests/test_models.py | 19 +++++++++++++++++++ 6 files changed, 67 insertions(+), 11 deletions(-) diff --git a/redash/controllers.py b/redash/controllers.py index 955257f343..82c96bb624 100644 --- a/redash/controllers.py +++ b/redash/controllers.py @@ -100,6 +100,7 @@ def status_api(): status['version'] = __version__ status['queries_count'] = models.Query.select().count() status['query_results_count'] = models.QueryResult.select().count() + status['unused_query_results_count'] = models.QueryResult.unused().count() status['dashboards_count'] = models.Dashboard.select().count() status['widgets_count'] = models.Widget.select().count() diff --git a/redash/models.py b/redash/models.py index 92403f13f2..d143f68266 100644 --- a/redash/models.py +++ b/redash/models.py @@ -225,6 +225,15 @@ def to_dict(self): 'retrieved_at': self.retrieved_at } + @classmethod + def unused(cls): + week_ago = datetime.datetime.now() - datetime.timedelta(days=7) + + unused_results = cls.select().where(Query.id == None, cls.retrieved_at < week_ago)\ + .join(Query, join_type=peewee.JOIN_LEFT_OUTER) + + return unused_results + @classmethod def get_latest(cls, data_source, query, ttl=0): query_hash = utils.gen_query_hash(query) diff --git a/redash/settings.py b/redash/settings.py index a3d8b8bb86..68f328c150 100644 --- a/redash/settings.py +++ b/redash/settings.py @@ -56,6 +56,10 @@ def parse_boolean(str): CELERY_BACKEND = os.environ.get("REDASH_CELERY_BACKEND", REDIS_URL) CELERY_FLOWER_URL = os.environ.get("REDASH_CELERY_FLOWER_URL", "/flower") +# The following enables periodic job (every 5 minutes) of removing unused query results. Behind this "feature flag" until +# proved to be "safe". +QUERY_RESULTS_CLEANUP_ENABLED = parse_boolean(os.environ.get("REDASH_QUERY_RESULTS_CLEANUP_ENABLED", "false")) + # Google Apps domain to allow access from; any user with email in this Google Apps will be allowed # access GOOGLE_APPS_DOMAIN = os.environ.get("REDASH_GOOGLE_APPS_DOMAIN", "") diff --git a/redash/tasks.py b/redash/tasks.py index 373385da20..7a7bf25b11 100644 --- a/redash/tasks.py +++ b/redash/tasks.py @@ -202,6 +202,22 @@ def cleanup_tasks(): redis_connection.delete(lock_keys[i]) +@celery.task(base=BaseTask) +def cleanup_query_results(): + """ + Job to cleanup unused query results -- such that no query links to them anymore, and older than a week (so it's less + likely to be open in someone's browser and be used). + + Each time the job deletes only 100 query results so it won't choke the database in case of many such results. + """ + + unused_query_results = models.QueryResult.unused().limit(100) + total_unused_query_results = models.QueryResult.unused().count() + deleted_count = models.QueryResult.delete().where(models.QueryResult.id << unused_query_results).execute() + + logger.info("Deleted %d unused query results out of total of %d." % (deleted_count, total_unused_query_results)) + + @celery.task(bind=True, base=BaseTask, track_started=True) def execute_query(self, query, data_source_id): # TODO: maybe this should be a class? diff --git a/redash/worker.py b/redash/worker.py index c2d6481f3f..e9ea0098ea 100644 --- a/redash/worker.py +++ b/redash/worker.py @@ -7,19 +7,26 @@ broker=settings.CELERY_BROKER, include='redash.tasks') +celery_schedule = { + 'refresh_queries': { + 'task': 'redash.tasks.refresh_queries', + 'schedule': timedelta(seconds=30) + }, + 'cleanup_tasks': { + 'task': 'redash.tasks.cleanup_tasks', + 'schedule': timedelta(minutes=5) + } +} + +if settings.QUERY_RESULTS_CLEANUP_ENABLED: + celery_schedule['cleanup_query_results'] = { + 'task': 'redash.tasks.cleanup_query_results', + 'schedule': timedelta(minutes=5) + } + celery.conf.update(CELERY_RESULT_BACKEND=settings.CELERY_BACKEND, - CELERYBEAT_SCHEDULE={ - 'refresh_queries': { - 'task': 'redash.tasks.refresh_queries', - 'schedule': timedelta(seconds=30) - }, - 'cleanup_tasks': { - 'task': 'redash.tasks.cleanup_tasks', - 'schedule': timedelta(minutes=5) - }, - }, + CELERYBEAT_SCHEDULE=celery_schedule, CELERY_TIMEZONE='UTC') - if __name__ == '__main__': celery.start() \ No newline at end of file diff --git a/tests/test_models.py b/tests/test_models.py index ab48350ddd..2f5a73c745 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -129,6 +129,25 @@ def test_get_latest_returns_the_last_cached_result_for_negative_ttl(self): self.assertEqual(found_query_result.id, qr.id) +class TestUnusedQueryResults(BaseTestCase): + def test_returns_only_unused_query_results(self): + two_weeks_ago = datetime.datetime.now() - datetime.timedelta(days=14) + qr = query_result_factory.create() + query = query_factory.create(latest_query_data=qr) + unused_qr = query_result_factory.create(retrieved_at=two_weeks_ago) + + self.assertIn(unused_qr, models.QueryResult.unused()) + self.assertNotIn(qr, models.QueryResult.unused()) + + def test_returns_only_over_a_week_old_results(self): + two_weeks_ago = datetime.datetime.now() - datetime.timedelta(days=14) + unused_qr = query_result_factory.create(retrieved_at=two_weeks_ago) + new_unused_qr = query_result_factory.create() + + self.assertIn(unused_qr, models.QueryResult.unused()) + self.assertNotIn(new_unused_qr, models.QueryResult.unused()) + + class TestQueryResultStoreResult(BaseTestCase): def setUp(self): super(TestQueryResultStoreResult, self).setUp() From 07f4a1b227faa695e56caa274e1181ee1856e225 Mon Sep 17 00:00:00 2001 From: Arik Fraimovich Date: Thu, 25 Dec 2014 15:52:52 +0200 Subject: [PATCH 2/2] Fix: wiredep failing after version upgrade --- rd_ui/Gruntfile.js | 1 - 1 file changed, 1 deletion(-) diff --git a/rd_ui/Gruntfile.js b/rd_ui/Gruntfile.js index 87f29a3ec8..37e08eb35b 100644 --- a/rd_ui/Gruntfile.js +++ b/rd_ui/Gruntfile.js @@ -163,7 +163,6 @@ module.exports = function (grunt) { // Automatically inject Bower components into the app wiredep: { options: { - cwd: '<%= yeoman.app %>' }, app: { src: ['<%= yeoman.app %>/index.html'],