-
Notifications
You must be signed in to change notification settings - Fork 2
/
Categorywise_tpVideo.scala
64 lines (48 loc) · 1.95 KB
/
Categorywise_tpVideo.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
package tube.analysis
import scala.collection.mutable.ListBuffer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
/**
*
* This program finds the top 10 videos in each category of the data, ranked by rating.
*
*
* Created by maniram on 10/2/18.
*/
object Categorywise_tpVideo {
  import scala.util.Try

  // Silence Spark's own INFO/WARN chatter so the printed report stays readable.
  Logger.getLogger("org").setLevel(Level.ERROR)

  /** One parsed input row, laid out as (rating, videoId, category). */
  private type RatedVideo = (Float, String, String)

  /** How many videos are kept per category. */
  private val TopN = 10

  /** Input file used when no path is passed on the command line. */
  private val DefaultInputPath = "/home/maniram/data/youtubedata.txt"

  /** Output directory used when no path is passed on the command line. */
  private val DefaultOutputPath = "/home/maniram/data/TubeAnalysis/Category_Top10_videos"

  /**
    * Parses one tab-separated line into a (rating, videoId, category) tuple.
    *
    * Column 0 is read as the video id, column 3 as the category and column 6 as the rating.
    *
    * @param line raw tab-separated input line
    * @return the parsed tuple, or `None` when the line has fewer than 8 columns or the
    *         rating column is not a valid float (such rows are skipped instead of
    *         aborting the whole job)
    */
  private def parseRecord(line: String): Option[RatedVideo] = {
    val fields = line.split("\\t")
    if (fields.length > 7)
      Try(fields(6).toFloat).toOption.map(rating => (rating, fields(0), fields(3)))
    else
      None
  }

  /** Returns at most [[TopN]] of the given videos, highest rating first. */
  private def highestRated(videos: Seq[RatedVideo]): List[RatedVideo] =
    videos.sortBy(_._1)(Ordering[Float].reverse).take(TopN).toList

  /**
    * Computes the top-rated videos of every category, saves them as a single-partition
    * text file and prints them to stdout.
    *
    * @param args optional overrides: `args(0)` is the input file, `args(1)` the output
    *             directory; the original hard-coded paths are used when they are absent
    */
  def main(args: Array[String]): Unit = {
    val inputPath = args.lift(0).getOrElse(DefaultInputPath)
    val outputPath = args.lift(1).getOrElse(DefaultOutputPath)

    val conf = new SparkConf().setMaster("local[3]").setAppName("Count_video_in_Category")
    val sc = new SparkContext(conf)
    try {
      // A single pass over the data: every category keeps a bounded list of its best
      // videos, so the input is read once instead of once per category.
      val topPerCategory: Array[RatedVideo] = sc.textFile(inputPath)
        .flatMap(line => parseRecord(line))
        .map(video => video._3 -> video)
        .aggregateByKey(List.empty[RatedVideo])(
          (best, video) => highestRated(video :: best),
          (left, right) => highestRated(left ++ right))
        .collect()
        .sortBy(_._1) // group the report by category name for a reproducible order
        .flatMap(_._2)

      // Back to a one-partition RDD so the result lands in a single part file.
      sc.parallelize(topPerCategory, 1).saveAsTextFile(outputPath)

      println("--------------------")
      println("----- Top 10 of each Category -------")
      println("--------------------")
      topPerCategory.foreach(println)
    } finally {
      // Release the context even when reading, aggregating or saving fails.
      sc.stop()
    }
  }
}