#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 13 10:16:58 2019
@3301
"""
import json
import time

from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import (StructType, StructField, LongType,
                               StringType, BooleanType, DoubleType)
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
sc = SparkContext(appName="PysparkNotebook")
sqlContext = HiveContext(sc)    # Hive-enabled context so DataFrames can be saved as Hive tables
ssc = StreamingContext(sc, 30)  # 30-second batch interval; flagged "to_change" for tuning

# Receiver-based Kafka stream: ZooKeeper quorum, consumer group id,
# and a {topic: partition-count} map
kafkaStream = KafkaUtils.createStream(ssc,
                                      'quickstart.cloudera:2181',
                                      'spark-streaming',
                                      {'twitter_feed': 1})
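# An alternative sketch: the direct (receiver-less) API reads straight from the
# Kafka brokers and tracks offsets itself instead of going through ZooKeeper.
# The broker address below is an assumption; replace it with your own brokers.
# directStream = KafkaUtils.createDirectStream(
#     ssc, ['twitter_feed'],
#     {'metadata.broker.list': 'quickstart.cloudera:9092'})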
# Function to flatten the nested "user" object of a tweet JSON into a tuple
def extract_user(raw):
    user = raw["user"]
    return (user["id"],
            user["name"],
            user["screen_name"],
            user["description"],
            user["location"],
            user["created_at"],
            user["friends_count"],
            user["followers_count"],
            user["statuses_count"],
            user["verified"],
            time.time())  # ingestion timestamp, stored as s_created_at
# Schema definition for the user table in Hive
user_schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("screen_name", StringType(), True),
    StructField("description", StringType(), True),
    StructField("location", StringType(), True),
    StructField("created_at", StringType(), True),
    StructField("friends_count", LongType(), True),
    StructField("followers_count", LongType(), True),
    StructField("statuses_count", LongType(), True),
    StructField("verified", BooleanType(), True),     # Twitter sends a boolean, not a string
    StructField("s_created_at", DoubleType(), True),  # ingestion time in epoch seconds
])
def processrdd(rdd):
    if rdd.isEmpty():  # skip empty micro-batches to avoid needless Hive writes
        return
    data_df = sqlContext.createDataFrame(rdd, schema=user_schema)  # map tuples onto the schema
    data_df.write.mode('append').saveAsTable("default.twitter_user")  # append to the Hive table
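# With the receiver-based API, a checkpoint directory (often combined with a
# write-ahead log) is the usual fault-tolerance addition; the HDFS path below
# is hypothetical:
# ssc.checkpoint('hdfs:///user/cloudera/checkpoints/twitter_feed')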
# Each Kafka record arrives as a (key, value) pair; the default decoder already
# returns the value as a UTF-8 string, so it can be parsed directly as JSON
parsed = kafkaStream.map(lambda v: json.loads(v[1]))
# Flatten every tweet to a user tuple and write each micro-batch to Hive
parsed.map(extract_user).foreachRDD(processrdd)
ssc.start()             # start the streaming application
ssc.awaitTermination()  # block until the stream is stopped or fails
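# Example launch, a sketch only: the Kafka package coordinates must match your
# Spark and Scala versions (the 0-8 connector is required for createStream;
# the version below is an assumption):
# spark-submit \
#   --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.0 \
#   SparkStreamingIngestion.py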