This repository has been archived by the owner on Sep 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathHive_Tutorial.hql
34 lines (30 loc) · 1.88 KB
/
Hive_Tutorial.hql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
-- Count the total number of document modifications (creations or updates) by the minute using the system generated _ts
-- Read from two input collections and store ouput in a separate collection
-- Add dependencies
add JAR <path to Hive JSON SerDe jar/json-serde-parent-SNAPSHOT.jar>;
add JAR <path to DocumentDB Java SDK jar/azure-documentdb-SNAPSHOT-dependencies.jar>;
add JAR <path to DocumentDB Hadoop jar/azure-documentdb-hadoop-SNAPSHOT.jar>;
-- Create a Hive Table from DocumentDB ids and timestamps
drop table DocumentDB_timestamps;
create external table DocumentDB_timestamps(id string, ts BIGINT)
stored by 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
tblproperties (
'DocumentDB.endpoint' = 'DocumentDB Endpoint',
'DocumentDB.key' = 'DocumentDB Primary Key',
'DocumentDB.db' = 'DocumentDB Database Name',
'DocumentDB.inputCollections' = 'DocumentDB Input Collection Name 1,Document Input Collection Name 2',
'DocumentDB.query' = 'SELECT r._rid AS id, r._ts AS ts FROM root r' );
-- Create a Hive Table for outputting to DocumentDB
drop table DocumentDB_analytics;
create external table DocumentDB_analytics(Month INT, Day INT, Hour INT, Minute INT, Total INT)
stored by 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
tblproperties (
'DocumentDB.endpoint' = 'DocumentDB Endpoint',
'DocumentDB.key' = 'DocumentDB Primary Key',
'DocumentDB.db' = 'DocumentDB Database Name',
'DocumentDB.outputCollections' = 'DocumentDB Output Collection Name' );
-- Insert aggregations to Output Hive Table
INSERT INTO table DocumentDB_analytics
SELECT month(from_unixtime(ts)) as Month, day(from_unixtime(ts)) as Day, hour(from_unixtime(ts)) as Hour, minute(from_unixtime(ts)) as Minute, COUNT(*) AS Total
FROM DocumentDB_timestamps
GROUP BY month(from_unixtime(ts)), day(from_unixtime(ts)), hour(from_unixtime(ts)) , minute(from_unixtime(ts));