Skip to content

[SNOW-172] Convert file_latest to dynamic table #110

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Introduce the dynamic table
USE SCHEMA {{database_name}}.synapse; --noqa: JJ01,PRS,TMP
CREATE OR REPLACE DYNAMIC TABLE FILE_LATEST
TARGET_LAG = '1 day'
WAREHOUSE = compute_xsmall
AS
WITH dedup_filesnapshots AS (
SELECT
*
FROM {{database_name}}.SYNAPSE_RAW.FILESNAPSHOTS --noqa: TMP
WHERE
SNAPSHOT_DATE >= CURRENT_TIMESTAMP - INTERVAL '30 days'
QUALIFY
ROW_NUMBER() OVER (
PARTITION BY ID
ORDER BY CHANGE_TIMESTAMP DESC, SNAPSHOT_TIMESTAMP DESC
) = 1
)
SELECT
*
FROM
dedup_filesnapshots;
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Add table and column comments to userprofile_latest dynamic table
USE SCHEMA {{database_name}}.synapse; --noqa: JJ01,PRS,TMP
-- Table comments
COMMENT ON DYNAMIC TABLE FILE_LATEST IS 'This dynamic table contains the latest snapshot of files during the past 30 days. Snapshots are taken when files are created or modified. Note: Snapshots are also taken periodically and independently of the changes. The snapshot_timestamp records when the snapshot was taken.';
-- Column comments
COMMENT ON COLUMN FILE_LATEST.CHANGE_TYPE IS 'The type of change that occurred on the file handle, e.g., CREATE, UPDATE, DELETE.';
COMMENT ON COLUMN FILE_LATEST.CHANGE_TIMESTAMP IS 'The time when the change (created/updated/deleted) on the file is pushed to the queue for snapshotting.';
COMMENT ON COLUMN FILE_LATEST.CHANGE_USER_ID IS 'The unique identifier of the user who made the change to the file.';
COMMENT ON COLUMN FILE_LATEST.SNAPSHOT_TIMESTAMP IS 'The time when the snapshot was taken (It is usually after the change happened).';
COMMENT ON COLUMN FILE_LATEST.ID IS 'The unique identifier of the file handle.';
COMMENT ON COLUMN FILE_LATEST.CREATED_BY IS 'The unique identifier of the user who created the file handle.';
COMMENT ON COLUMN FILE_LATEST.CREATED_ON IS 'The creation timestamp of the file handle.';
COMMENT ON COLUMN FILE_LATEST.MODIFIED_ON IS 'The most recent change time of the file handle.';
COMMENT ON COLUMN FILE_LATEST.CONCRETE_TYPE IS 'The type of the file handle. Allowed file handles are: S3FileHandle, ProxyFileHandle, ExternalFileHandle, ExternalObjectStoreFileHandle, GoogleCloudFileHandle.';
COMMENT ON COLUMN FILE_LATEST.CONTENT_MD5 IS 'The md5 hash (using MD5 algorithm) of the file referenced by the file handle.';
COMMENT ON COLUMN FILE_LATEST.CONTENT_TYPE IS 'Metadata about the content of the file, e.g., application/json, application/zip, application/octet-stream.';
COMMENT ON COLUMN FILE_LATEST.FILE_NAME IS 'The name of the file referenced by the file handle.';
COMMENT ON COLUMN FILE_LATEST.STORAGE_LOCATION_ID IS 'The identifier of the environment, where the physical files are stored.';
COMMENT ON COLUMN FILE_LATEST.CONTENT_SIZE IS 'The size of the file referenced by the file handle.';
COMMENT ON COLUMN FILE_LATEST.BUCKET IS 'The bucket where the file is physically stored. Applicable for s3 and GCP, otherwise empty.';
COMMENT ON COLUMN FILE_LATEST.KEY IS 'The key name uniquely identifies the object (file) in the bucket.';
COMMENT ON COLUMN FILE_LATEST.PREVIEW_ID IS 'The identifier of the file handle that contains a preview of the file referenced by this file handle.';
COMMENT ON COLUMN FILE_LATEST.IS_PREVIEW IS 'If true, the file referenced by this file handle is a preview of another file.';
COMMENT ON COLUMN FILE_LATEST.STATUS IS 'The availability status of the file referenced by the file handle. AVAILABLE: accessible via Synapse; UNLINKED: not referenced by Synapse and therefore available for garbage collection; ARCHIVED: the file has been garbage collected.';
COMMENT ON COLUMN FILE_LATEST.SNAPSHOT_DATE IS 'The data is partitioned for fast and cost effective queries. The snapshot_timestamp field is converted into a date and stored in the snapshot_date field for partitioning. The date should be used as a condition (WHERE CLAUSE) in the queries.';
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Backup the original latest table
USE SCHEMA {{database_name}}.synapse; --noqa: JJ01,PRS,TMP
-- Clone the FILE_LATEST table to ``FILE_LATEST_BACKUP`` for validation purposes
CREATE TABLE IF NOT EXISTS FILE_LATEST_BACKUP CLONE FILE_LATEST;
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Drop FILE_LATEST table
USE SCHEMA {{database_name}}.synapse;
DROP TABLE IF EXISTS FILE_LATEST;
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Drop the snapshot stream
USE SCHEMA {{database_name}}.synapse_raw;
DROP STREAM IF EXISTS FILESNAPSHOTS_STREAM;
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- Drop any scheduled tasks
USE SCHEMA {{database_name}}.synapse_raw;
-- Suspend ROOT TASK
ALTER TASK REFRESH_SYNAPSE_WAREHOUSE_S3_STAGE_TASK SUSPEND;
-- Drop UPSERT_TO_FILE_LATEST_TASK
DROP TASK IF EXISTS UPSERT_TO_FILE_LATEST_TASK;
-- Drop REMOVE_DELETE_FILES_TASK
DROP TASK IF EXISTS REMOVE_DELETE_FILES_TASK;
-- Resume the ROOT task and its child tasks
SELECT SYSTEM$TASK_DEPENDENTS_ENABLE( 'REFRESH_SYNAPSE_WAREHOUSE_S3_STAGE_TASK' );