Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Job to cleanup unused query results #344

Merged
merged 2 commits into from
Dec 25, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion rd_ui/Gruntfile.js
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ module.exports = function (grunt) {
// Automatically inject Bower components into the app
wiredep: {
options: {
cwd: '<%= yeoman.app %>'
},
app: {
src: ['<%= yeoman.app %>/index.html'],
Expand Down
1 change: 1 addition & 0 deletions redash/controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def status_api():
status['version'] = __version__
status['queries_count'] = models.Query.select().count()
status['query_results_count'] = models.QueryResult.select().count()
status['unused_query_results_count'] = models.QueryResult.unused().count()
status['dashboards_count'] = models.Dashboard.select().count()
status['widgets_count'] = models.Widget.select().count()

Expand Down
9 changes: 9 additions & 0 deletions redash/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,15 @@ def to_dict(self):
'retrieved_at': self.retrieved_at
}

@classmethod
def unused(cls):
week_ago = datetime.datetime.now() - datetime.timedelta(days=7)

unused_results = cls.select().where(Query.id == None, cls.retrieved_at < week_ago)\
.join(Query, join_type=peewee.JOIN_LEFT_OUTER)

return unused_results

@classmethod
def get_latest(cls, data_source, query, ttl=0):
query_hash = utils.gen_query_hash(query)
Expand Down
4 changes: 4 additions & 0 deletions redash/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ def parse_boolean(str):
CELERY_BACKEND = os.environ.get("REDASH_CELERY_BACKEND", REDIS_URL)
CELERY_FLOWER_URL = os.environ.get("REDASH_CELERY_FLOWER_URL", "/flower")

# The following enables periodic job (every 5 minutes) of removing unused query results. Behind this "feature flag" until
# proved to be "safe".
QUERY_RESULTS_CLEANUP_ENABLED = parse_boolean(os.environ.get("REDASH_QUERY_RESULTS_CLEANUP_ENABLED", "false"))

# Google Apps domain to allow access from; any user with email in this Google Apps will be allowed
# access
GOOGLE_APPS_DOMAIN = os.environ.get("REDASH_GOOGLE_APPS_DOMAIN", "")
Expand Down
16 changes: 16 additions & 0 deletions redash/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,22 @@ def cleanup_tasks():
redis_connection.delete(lock_keys[i])


@celery.task(base=BaseTask)
def cleanup_query_results():
"""
Job to cleanup unused query results -- such that no query links to them anymore, and older than a week (so it's less
likely to be open in someone's browser and be used).

Each time the job deletes only 100 query results so it won't choke the database in case of many such results.
"""

unused_query_results = models.QueryResult.unused().limit(100)
total_unused_query_results = models.QueryResult.unused().count()
deleted_count = models.QueryResult.delete().where(models.QueryResult.id << unused_query_results).execute()

logger.info("Deleted %d unused query results out of total of %d." % (deleted_count, total_unused_query_results))


@celery.task(bind=True, base=BaseTask, track_started=True)
def execute_query(self, query, data_source_id):
# TODO: maybe this should be a class?
Expand Down
29 changes: 18 additions & 11 deletions redash/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,26 @@
broker=settings.CELERY_BROKER,
include='redash.tasks')

celery_schedule = {
'refresh_queries': {
'task': 'redash.tasks.refresh_queries',
'schedule': timedelta(seconds=30)
},
'cleanup_tasks': {
'task': 'redash.tasks.cleanup_tasks',
'schedule': timedelta(minutes=5)
}
}

if settings.QUERY_RESULTS_CLEANUP_ENABLED:
celery_schedule['cleanup_query_results'] = {
'task': 'redash.tasks.cleanup_query_results',
'schedule': timedelta(minutes=5)
}

celery.conf.update(CELERY_RESULT_BACKEND=settings.CELERY_BACKEND,
CELERYBEAT_SCHEDULE={
'refresh_queries': {
'task': 'redash.tasks.refresh_queries',
'schedule': timedelta(seconds=30)
},
'cleanup_tasks': {
'task': 'redash.tasks.cleanup_tasks',
'schedule': timedelta(minutes=5)
},
},
CELERYBEAT_SCHEDULE=celery_schedule,
CELERY_TIMEZONE='UTC')


if __name__ == '__main__':
celery.start()
19 changes: 19 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,25 @@ def test_get_latest_returns_the_last_cached_result_for_negative_ttl(self):
self.assertEqual(found_query_result.id, qr.id)


class TestUnusedQueryResults(BaseTestCase):
def test_returns_only_unused_query_results(self):
two_weeks_ago = datetime.datetime.now() - datetime.timedelta(days=14)
qr = query_result_factory.create()
query = query_factory.create(latest_query_data=qr)
unused_qr = query_result_factory.create(retrieved_at=two_weeks_ago)

self.assertIn(unused_qr, models.QueryResult.unused())
self.assertNotIn(qr, models.QueryResult.unused())

def test_returns_only_over_a_week_old_results(self):
two_weeks_ago = datetime.datetime.now() - datetime.timedelta(days=14)
unused_qr = query_result_factory.create(retrieved_at=two_weeks_ago)
new_unused_qr = query_result_factory.create()

self.assertIn(unused_qr, models.QueryResult.unused())
self.assertNotIn(new_unused_qr, models.QueryResult.unused())


class TestQueryResultStoreResult(BaseTestCase):
def setUp(self):
super(TestQueryResultStoreResult, self).setUp()
Expand Down