- Trending YouTube Video Statistics. The data should be placed in a folder named
raw_dataset
-
pre_processing.py
: Notebook to process the raw dataset downloaded from Kaggle. Running this script creates a directory namedclean_dataset
, which contains the cleaned.csv
files. The script takes two arguments:frac
andthreshold
.frac
represents what fraction of the dataset to take from the raw data, andthreshold
will remove tags that appear less thanthreshold
times. -
create_rdf.py
: Reads the cleaned data and uses Tarql to convert the.csv
files into RDF triples. TheCONSTRUCT
queries in SPARQL are placed in thesparql
folder. Running this script will create.ttl
files. The script will place the.ttl
files into therdf_dataset
directory. The.ttl
files can now be loaded into a SPARQL endpoint to run queries.
PREFIX tag: <http://www.holygoat.co.uk/owl/redwood/0.1/tags/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX ex: <http://ex.org/>
PREFIX ct: <http://ex.org/country/>
PREFIX tg: <http://ex.org/tag/>
PREFIX cat: <http://ex.org/category/>
PREFIX ch: <http://ex.org/channel/>
SELECT ?country (AVG(?views) as ?avg) WHERE {
?country a ex:Country .
?video a ex:Video .
?video ex:country ?country .
?video ex:views ?views
} GROUP BY(?country)
ORDER BY DESC(?avg)
SELECT ?channel (MIN(?views) as ?min_video) (MAX(?views) as ?max_video) WHERE{
?channel a ex:Channel .
?video ex:channel_title ?channel .
?video ex:views ?views .
?video a ex:Video .
} GROUP BY ?channel HAVING(COUNT(*)>10)
ORDER BY DESC(?min_video)
LIMIT 10
SELECT ?category ?tag ?cnt WHERE {
{SELECT DISTINCT ?category (MAX(?cnt) as ?MaxCount) WHERE {
{SELECT DISTINCT ?category ?tag (COUNT(?tag) as ?cnt) WHERE {
?category a ex:Category .
?video ex:category ?category .
?video ex:hasTag ?tag
}
GROUP BY ?category ?tag
ORDER BY DESC(?cnt)}
} GROUP BY ?category
ORDER BY DESC(?MaxCount)}
{SELECT DISTINCT ?category ?tag (COUNT(?tag) as ?cnt) WHERE {
?category a ex:Category .
?video ex:category ?category .
?video ex:hasTag ?tag
}
GROUP BY ?category ?tag
ORDER BY DESC(?cnt)}
FILTER(?cnt = ?MaxCount)
}
4. Find the channels with at least 15 videos or categories that have fastest time for a video to become trending (time_to_trending = trending_date - publish_date).
## Query for categories
SELECT ?category ?time_to_trending_average_hours WHERE{
{SELECT ?category (AVG(?time_in_seconds) as ?average_time_to_trending_seconds) WHERE {
?video a ex:Video .
?video ex:category ?category .
?video ex:title ?title .
?video ex:publish_timestamp ?publish_timestamp .
?video ex:trending_timestamp ?trending_timestamp
BIND(xsd:dateTime(?trending_timestamp) - xsd:dateTime(?publish_timestamp) AS ?time2trending)
BIND(day(?time2trending) AS ?days)
BIND(hours(?time2trending) AS ?hours)
BIND(minutes(?time2trending) AS ?minutes)
BIND(seconds(?time2trending) AS ?seconds)
BIND( (?days*86400 + ?hours*3600 + ?minutes*60 + ?seconds) AS ?time_in_seconds)
}GROUP BY ?category}
BIND(ceil(?average_time_to_trending_seconds/3600) AS ?time_to_trending_average_hours)
}ORDER BY ASC(?time_to_trending_average_hours)
## Query for channels with at least 15 videos
SELECT ?channel_title ?time_to_trending_average_hours WHERE{
{SELECT ?channel_title (AVG(?time_in_seconds) as ?average_time_to_trending_seconds) WHERE {
?video a ex:Video .
?video ex:channel_title ?channel_title .
?video ex:publish_timestamp ?publish_timestamp .
?video ex:trending_timestamp ?trending_timestamp
BIND(xsd:dateTime(?trending_timestamp) - xsd:dateTime(?publish_timestamp) AS ?time2trending)
BIND(year(?time2trending) AS ?years)
BIND(month(?time2trending) AS ?months)
BIND(day(?time2trending) AS ?days)
BIND(hours(?time2trending) AS ?hours)
BIND(minutes(?time2trending) AS ?minutes)
BIND(seconds(?time2trending) AS ?seconds)
BIND( (?days*86400 + ?hours*3600 + ?minutes*60 + ?seconds) AS ?time_in_seconds)
}GROUP BY ?channel_title HAVING(COUNT(*)>4)}
BIND(ceil(?average_time_to_trending_seconds/3600) AS ?time_to_trending_average_hours)
}ORDER BY ASC(?time_to_trending_average_hours) LIMIT 10
SELECT (MAX(?performance) as ?m_p) WHERE {
?tag a tg:Funny .
?video a ex:Video .
?video ex:hasTag ?tag .
?video ex:category ?cat .
?video ex:views ?views .
?video ex:likes ?likes .
?video ex:dislikes ?dislikes .
?video ex:comment_count ?comments .
BIND(((?likes + ?comments)/?views) as ?performance) .
}GROUP BY ?tag ?cat
ORDER BY DESC(?m_p)
LIMIT 10