Skip to content

Commit

Permalink
Merge pull request #344 from EverythingMe/feature/query_results_cleanup
Browse files Browse the repository at this point in the history
Job to cleanup unused query results
  • Loading branch information
arikfr committed Dec 25, 2014
2 parents 2278a18 + 07f4a1b commit 71b4b45
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 12 deletions.
1 change: 0 additions & 1 deletion rd_ui/Gruntfile.js
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ module.exports = function (grunt) {
// Automatically inject Bower components into the app
wiredep: {
options: {
cwd: '<%= yeoman.app %>'
},
app: {
src: ['<%= yeoman.app %>/index.html'],
Expand Down
1 change: 1 addition & 0 deletions redash/controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def status_api():
status['version'] = __version__
status['queries_count'] = models.Query.select().count()
status['query_results_count'] = models.QueryResult.select().count()
status['unused_query_results_count'] = models.QueryResult.unused().count()
status['dashboards_count'] = models.Dashboard.select().count()
status['widgets_count'] = models.Widget.select().count()

Expand Down
9 changes: 9 additions & 0 deletions redash/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,15 @@ def to_dict(self):
'retrieved_at': self.retrieved_at
}

@classmethod
def unused(cls):
    """Return query results eligible for cleanup.

    A result is considered unused when no query references it anymore
    (LEFT OUTER JOIN against Query finds no match) and it was retrieved
    more than a week ago.
    """
    cutoff = datetime.datetime.now() - datetime.timedelta(days=7)

    # Build the filter first, then attach the outer join; Query.id == None
    # only matches rows with no joined query.
    candidates = cls.select().where(Query.id == None, cls.retrieved_at < cutoff)
    return candidates.join(Query, join_type=peewee.JOIN_LEFT_OUTER)

@classmethod
def get_latest(cls, data_source, query, ttl=0):
query_hash = utils.gen_query_hash(query)
Expand Down
4 changes: 4 additions & 0 deletions redash/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ def parse_boolean(str):
CELERY_BACKEND = os.environ.get("REDASH_CELERY_BACKEND", REDIS_URL)
CELERY_FLOWER_URL = os.environ.get("REDASH_CELERY_FLOWER_URL", "/flower")

# The following enables a periodic job (every 5 minutes) that removes unused query results. It is kept behind this
# "feature flag" until it has proved to be safe.
QUERY_RESULTS_CLEANUP_ENABLED = parse_boolean(os.environ.get("REDASH_QUERY_RESULTS_CLEANUP_ENABLED", "false"))

# Google Apps domain to allow access from; any user with email in this Google Apps will be allowed
# access
GOOGLE_APPS_DOMAIN = os.environ.get("REDASH_GOOGLE_APPS_DOMAIN", "")
Expand Down
16 changes: 16 additions & 0 deletions redash/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,22 @@ def cleanup_tasks():
redis_connection.delete(lock_keys[i])


@celery.task(base=BaseTask)
def cleanup_query_results():
    """
    Remove unused query results: results no query links to anymore, and that are
    older than a week (so they are unlikely to still be open in someone's browser).

    Each run deletes at most 100 query results, so the job won't choke the
    database when there are many such results.
    """
    unused_query_results = models.QueryResult.unused().limit(100)
    total_unused_query_results = models.QueryResult.unused().count()
    deleted_count = models.QueryResult.delete() \
        .where(models.QueryResult.id << unused_query_results) \
        .execute()

    # Use lazy %-style logging args instead of eager "%" formatting: the string
    # is only built if the record is actually emitted.
    logger.info("Deleted %d unused query results out of total of %d.",
                deleted_count, total_unused_query_results)


@celery.task(bind=True, base=BaseTask, track_started=True)
def execute_query(self, query, data_source_id):
# TODO: maybe this should be a class?
Expand Down
29 changes: 18 additions & 11 deletions redash/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,26 @@
broker=settings.CELERY_BROKER,
include='redash.tasks')

# Celery beat schedule: the periodic background jobs and how often each runs.
celery_schedule = {
'refresh_queries': {
'task': 'redash.tasks.refresh_queries',
'schedule': timedelta(seconds=30)
},
'cleanup_tasks': {
'task': 'redash.tasks.cleanup_tasks',
'schedule': timedelta(minutes=5)
}
}

# Query results cleanup is behind a feature flag (REDASH_QUERY_RESULTS_CLEANUP_ENABLED);
# only add it to the schedule when explicitly enabled.
if settings.QUERY_RESULTS_CLEANUP_ENABLED:
celery_schedule['cleanup_query_results'] = {
'task': 'redash.tasks.cleanup_query_results',
'schedule': timedelta(minutes=5)
}

celery.conf.update(CELERY_RESULT_BACKEND=settings.CELERY_BACKEND,
CELERYBEAT_SCHEDULE={
'refresh_queries': {
'task': 'redash.tasks.refresh_queries',
'schedule': timedelta(seconds=30)
},
'cleanup_tasks': {
'task': 'redash.tasks.cleanup_tasks',
'schedule': timedelta(minutes=5)
},
},
CELERYBEAT_SCHEDULE=celery_schedule,
CELERY_TIMEZONE='UTC')


if __name__ == '__main__':
celery.start()
19 changes: 19 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,25 @@ def test_get_latest_returns_the_last_cached_result_for_negative_ttl(self):
self.assertEqual(found_query_result.id, qr.id)


class TestUnusedQueryResults(BaseTestCase):
    def test_returns_only_unused_query_results(self):
        # A result referenced by a query must never be reported as unused,
        # while an old orphaned result must be.
        retrieved_two_weeks_ago = datetime.datetime.now() - datetime.timedelta(days=14)
        used_result = query_result_factory.create()
        query_factory.create(latest_query_data=used_result)
        orphan_result = query_result_factory.create(retrieved_at=retrieved_two_weeks_ago)

        unused = list(models.QueryResult.unused())
        self.assertIn(orphan_result, unused)
        self.assertNotIn(used_result, unused)

    def test_returns_only_over_a_week_old_results(self):
        # Even unreferenced results are kept if they are less than a week old.
        retrieved_two_weeks_ago = datetime.datetime.now() - datetime.timedelta(days=14)
        old_orphan = query_result_factory.create(retrieved_at=retrieved_two_weeks_ago)
        fresh_orphan = query_result_factory.create()

        unused = list(models.QueryResult.unused())
        self.assertIn(old_orphan, unused)
        self.assertNotIn(fresh_orphan, unused)


class TestQueryResultStoreResult(BaseTestCase):
def setUp(self):
super(TestQueryResultStoreResult, self).setUp()
Expand Down

0 comments on commit 71b4b45

Please sign in to comment.