# -*- coding: utf-8 -*-
"""DA_Proficiency_Insight_Thanapoom.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1BdlbKVMPt_BoTZgq_tcdCDGmAzDdWfZQ
# Project name: Data Analyst Proficiency Insight: A Comprehensive Skills and Qualifications Analysis
**Name**: Thanapoom Phatthanaphan <br>
**CWID**: 20011296
## Introduction
In today's job market, there's a growing need for people who are good at data analysis. This demand is especially high in sectors like Banking, Media & Entertainment, and Healthcare, where working with big sets of data is common. Many individuals are keen on starting a career in technology, specifically as a data analyst.
However, becoming a data analyst involves learning a wide range of skills, and different industries look for different ones. For those who are just starting out, figuring out which skills matter most can be confusing.
Our project is here to help with that. We collect information from popular job websites and carefully study job descriptions to figure out the main skills you need to be a successful data analyst. The goal is to give newcomers a clear picture of the skills that really matter in real-world data analysis jobs, so they can make smart choices about which skills to develop.
## Objectives and expected contributions
The primary objectives of this project are to help individuals identify the pivotal skills required for a data analyst role and to offer comprehensive insights into the skill sets sought after by various industries. Using the Python programming language together with Natural Language Processing (NLP) techniques, we conduct a rigorous analysis of job descriptions for data analyst positions.
## Methodology
The project's methodology employs a multifaceted approach. Initially, Python libraries such as BeautifulSoup and Selenium are utilized for web scraping, enabling the extraction of relevant data from leading job websites featuring data analyst positions. Subsequently, the collected job descriptions undergo thorough preprocessing to ensure data integrity and suitability for analysis; this includes removing rows with missing values, eliminating stopwords and punctuation, converting text to lowercase, and lemmatizing words to standardize the dataset. Following preprocessing, the dataset is segmented into unigrams and bigrams, facilitating a detailed analysis of individual words and adjacent word pairs. After that, we use a rule-based approach and a word-embeddings approach to match words in the job descriptions against predefined skill keywords, then select the better-performing one for identifying the relevant skills. Finally, the results are visualized to pinpoint which skills are in highest demand in the data analyst job market.
# Download and import the necessary libraries
"""
# Access to the drive that we want to work with and save the file
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')
# Download necessary libraries
!pip install selenium
!pip install wordcloud
!pip install fuzzywuzzy
# Import necessary libraries
# Web Scraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# Time Recording
import time
# Data Analysis
import numpy as np
import pandas as pd
# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import string
import spacy
from fuzzywuzzy import fuzz
from nltk.util import ngrams
from collections import defaultdict
# Data Visualization (Plotting)
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Word Embeddings
from gensim.models import Word2Vec
import gensim.downloader as api
# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
"""# 1) Data Collection
### Web Scraping
Scrape the job details from Indeed.com and save them to a .csv file
"""
def getJobDescription(page_url_format, max_page):
# Start the timer
start = time.time()
# Initialize the list to store the job details
jobs_list = []
job_search = 'data+analyst'
location_search = 'United+States'
# Iterate to open each page of the website
for i in range(0, max_page):
page_url = page_url_format.format(job_search, location_search, i*10)
driver = webdriver.Chrome()
driver.get(page_url)
# Set the loading time for the webpage
time.sleep(10)
        # Find the results container, then collect the individual job cards inside it
        job_page = driver.find_element(By.ID, 'mosaic-jobResults')
        jobs = job_page.find_elements(By.CLASS_NAME, 'job_seen_beacon')
# Iterate to get the job detail of each job
for job in jobs:
job_detail = []
job_des_list = []
# Get the job Title
job_title = job.find_element(By.CLASS_NAME, 'jobTitle')
job_detail.append(job_title.text)
# Get the job link
job_detail.append(job_title.find_element(By.CSS_SELECTOR, 'a').get_attribute('href'))
# Get the company name
job_detail.append(job.find_element(By.CLASS_NAME, 'css-92r8pb').text)
# Get the location of the job
job_detail.append(job.find_element(By.CLASS_NAME, 'css-1p0sjhy').text)
# Click the job element
job_title.click()
# Set the loading time for the webpage
time.sleep(8)
# Get the job description
try:
job_des_list.append(driver.find_element(By.ID, 'jobDescriptionText').text)
            except Exception:
job_des_list.append(None)
# Store the job detail in the list
job_detail.append(job_des_list[0])
jobs_list.append(job_detail)
# Close the webpage
driver.quit()
# Stop the timer & Display the total execution time
end = time.time()
execution_time = end - start
print(f"{execution_time} seconds to complete the task.")
return jobs_list
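# Note: the function above relies on fixed time.sleep() pauses. Below is a sketch
# of an alternative using the explicit waits already imported (WebDriverWait and
# expected_conditions), which returns as soon as the results container appears
# instead of always sleeping; the 15-second timeout is an assumption.
def wait_for_job_results(driver, timeout=15):
    try:
        # Block until the job-results container is present in the DOM, or time out
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, 'mosaic-jobResults'))
        )
    except TimeoutException:
        return None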
# Initialize the webpage URL format
page_url_format = "https://www.indeed.com/jobs?q={}&l={}&start={}"
# Define the number of pages required for around 1,000 jobs (the website shows 15 jobs per page)
max_page = int(np.ceil(1000/15))
# Call the function to get the job details from the website
jobs_list = getJobDescription(page_url_format, max_page)
# Create the DataFrame
columns_name = ['Position', 'Link', 'Company', 'Location', 'Job Description']
jobs_list_table = pd.DataFrame(jobs_list, columns=columns_name)
# Save the DataFrame to .csv file
jobs_list_table.to_csv('jobs_list.csv', index=False)
print("The DataFrame has been saved.")
"""### Import the dataset
The dataset that we scraped from the website
"""
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/jobs_list.csv')
df
# Check null value
df.info()
# Remove the row with null values
df_no_na = df.dropna()
df_no_na
# Select only the column that we will use
data = df_no_na.loc[:, ['Job Description']]
data
"""# 2) Data Preprocessing
Apply NLP techniques to transform the text data into a suitable format for analysis
"""
def tokenization(doc):
# Tokenize text
tokens = nltk.word_tokenize(doc)
# Remove stopwords and punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word not in punctuation]
# Lemmatize words
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
# Convert word to lowercase
lower_lemmatized_tokens = [token.lower() for token in lemmatized_tokens]
return lower_lemmatized_tokens
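# Quick illustrative sanity check of the tokenizer on a made-up sentence
# (not part of the pipeline): stopwords and punctuation are dropped,
# words are lemmatized and lowercased.
# Expected output: ['strong', 'sql', 'python', 'skill', 'required']
print(tokenization("Strong SQL and Python skills are required."))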
# Call the function to tokenize the text
data['Job Description'] = data['Job Description'].apply(tokenization)
# Extract the data to get unigram and bigram
uni_bi_grams_dict = {'JD_uni_bi_grams': []}
for jd in data['Job Description']:
unigrams = list(ngrams(jd, 1))
bigrams = list(ngrams(jd, 2))
uni_bi_grams = unigrams + bigrams
uni_bi_grams_dict['JD_uni_bi_grams'].append(uni_bi_grams)
jd_uni_bi_grams = pd.DataFrame(uni_bi_grams_dict)
jd_uni_bi_grams.head()
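# Illustrative example (made-up tokens, not part of the pipeline) of what the
# unigram + bigram extraction above produces:
#   unigrams of ['data', 'analyst', 'role'] -> [('data',), ('analyst',), ('role',)]
#   bigrams  of ['data', 'analyst', 'role'] -> [('data', 'analyst'), ('analyst', 'role')]
sample_tokens = ['data', 'analyst', 'role']
print(list(ngrams(sample_tokens, 1)) + list(ngrams(sample_tokens, 2)))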
"""### Creating the reference of required skills
Build the reference of required skills by manually reading through 50 job posts to extract the required skills and keywords.
"""
# Define the skills keywords
skill_keywords = {
'Python': ['python'],
'SQL': ['sql', 'structured query language'],
'R': ['r'],
'VBA': ['vba', 'visual basic'],
'C': ['c'],
'C++': ['c++'],
'C#': ['c#'],
'Java': ['java'],
'JavaScript': ['javascript'],
'HTML': ['html', 'hypertext markup language'],
'Ruby': ['ruby'],
'RDBMS': ['rdbms', 'relational database', 'postgresql', 'mysql', 'oracle',
          'sql server', 'sql', 'structured query language', 'access', 'query', 'querying'],
'NoSQL': ['nosql', 'mongodb', 'cassandra'],
'Access': ['access', 'ms access', 'microsoft access'],
'Excel': ['excel', 'microsoft excel', 'ms excel'],
'Word': ['word', 'ms word', 'microsoft word'],
'PowerPoint': ['ppt', 'powerpoint', 'ms powerpoint', 'microsoft powerpoint'],
'Sharepoint': ['sharepoint'],
'ETL': ['etl'],
'Oracle': ['oracle'],
'Snowflake': ['snowflake'],
'Tableau': ['tableau'],
'PowerBI': ['power bi', 'bi'],
'Looker': ['looker'],
'QlikView': ['qlikview'],
'MicroStrategy': ['microstrategy'],
'Plotly': ['plotly'],
'Matplotlib': ['matplotlib'],
'Seaborn': ['seaborn'],
'Pandas': ['pandas'],
'NumPy': ['numpy'],
'Statistics': ['statistics'],
'Probability': ['probability'],
'Hypothesis Testing': ['hypothesis testing'],
'A/B Testing': ['a/b testing', 'ab testing'],
'MATLAB': ['matlab'],
'Time Series': ['time series', 'forecasting'],
'Regression': ['regression'],
'Classification': ['classification'],
'Clustering': ['clustering'],
'Predictive Modeling': ['predictive modeling'],
'Tensorflow': ['tensorflow'],
'Pytorch': ['pytorch'],
'Scikit-Learn': ['scikit-learn'],
'Hadoop': ['hadoop'],
'Spark': ['spark'],
'Hive': ['hive'],
'Databricks': ['databricks'],
'SAP': ['sap'],
'SCM': ['scm', 'supply chain management'],
'CRM': ['crm', 'customer relationship management', 'salesforce'],
'ERP': ['erp', 'enterprise resource planning'],
'SAAS': ['saas', 'software as a service'],
'PeopleSoft': ['peoplesoft'],
'Agile': ['agile'],
'Teamwork': ['team-oriented environment', 'team-oriented', 'collaboration', 'cooperation',
'teamwork', 'team management', 'team environment', 'business partners',
'working relationships', 'team-centric', 'collaborative spirit'],
'Presentation': ['presentation', 'present'],
'Reporting': ['reporting', 'report'],
'Verbal': ['verbal', 'oral', 'well-spoken'],
'Written': ['writing', 'written'],
'Detail-oriented': ['detail-oriented', 'attention to detail', 'eye for detail', 'accuracy'],
'Motivation': ['motivation', 'ambition', 'willingness to learn', 'delivering result',
'continuous learning', 'self-motivation', 'work independently',
'self-motivated', 'self-learner', 'self-directed'],
'Adaptability': ['adaptability', 'flexible', 'multitasking', 'fast-paced'],
'Good attitude': ['attitude', 'self-learner', 'self-directed',
'under pressure', 'high-pressure'],
'Problem-solving': ['problem-solving', 'problem solving'],
'Critical-thinking': ['critical-thinking', 'critical-thinker', 'critical thinking'],
'Time management': ['time management', 'timely manner', 'prioritize time',
'deadline-driven', 'meet deadlines'],
'Project management': ['project management']
}
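# Illustrative reverse lookup (a hypothetical helper, not used by the pipeline):
# find which skill label a matched keyword belongs to.
def skill_for_keyword(keyword):
    for skill, keywords in skill_keywords.items():
        if keyword in keywords:
            return skill
    return None
print(skill_for_keyword('power bi'))  # -> 'PowerBI'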
# Initialize the dictionary for counting the skills
individual_skill_dict = {}
for skill in skill_keywords:
    individual_skill_dict[skill] = [0] * len(jd_uni_bi_grams)
# Create the dataframe from the dictionary
individual_skill_counts_df = pd.DataFrame.from_dict(individual_skill_dict)
individual_skill_counts_df.head()
"""# 3) Exploratory Data Analysis
## Rule-based approach
Count the required skills from the unigrams and bigrams obtained in the previous step, using FuzzyWuzzy for fuzzy string matching.
"""
def fuzzy_match(text, keywords):
    """Return the highest fuzzy-similarity score between the text and any keyword."""
    # Keep the best similarity ratio seen so far; stop early once a
    # score passes the 90-point match threshold
    best_score = 0
    for keyword in keywords:
        best_score = max(best_score, fuzz.ratio(text, keyword))
        if best_score >= 90:
            break
    return best_score
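# Illustrative check of the 90-point threshold (scores come from fuzzywuzzy's
# Levenshtein-based ratio; exact values may differ slightly between versions):
print(fuzz.ratio('tableu', 'tableau'))   # ~92 -> close misspelling, counted as a match
print(fuzz.ratio('excel', 'excellent'))  # ~71 -> below the threshold, not a match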
def count_skills_rule_based(gram_texts, skill_keywords, skill_counts_df):
"""Function to detect skills from the job descriptions using Rule-based approach"""
# Start the timer
start = time.time()
# Iterate over the keywords and check for matches
for i, words in enumerate(gram_texts):
for word in words:
for skill, keywords in skill_keywords.items():
# Apply the fuzzy_match function to the current gram text and keyword
score = fuzzy_match(word, keywords)
if score >= 90:
skill_counts_df.loc[i, skill] = 1
if (i + 1) % 5 == 0 or i == len(gram_texts) - 1:
print(f"Done {i + 1} jobs")
# Stop the timer & Display the total execution time
end = time.time()
execution_time = end - start
print(f"{execution_time} seconds to complete the rule-based approach with {len(gram_texts)} jobs.")
return skill_counts_df
# Build a list of gram strings for skill detection
gram_texts = []
for grams_list in jd_uni_bi_grams['JD_uni_bi_grams']:
temp_list = []
for gram in grams_list:
temp_list.append(" ".join(gram))
gram_texts.append(temp_list)
# Call the function to count the required skills using Rule-based approach
individual_skill_counts_rule_based = count_skills_rule_based(gram_texts, skill_keywords, individual_skill_counts_df)
# Save to .csv file
individual_skill_counts_df.to_csv('/content/individual_skill_counts_rule_based.csv', index=False)
files.download('/content/individual_skill_counts_rule_based.csv')
# Import the dataset that has already counted the skills
# individual_skill_counts_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/individual_skill_counts_rule_based.csv')
# Check the dataset
individual_skill_counts_rule_based.head(10)
# Get the sum of required skills for each individual skill from rule-based approach
sum_skills = {}
for skill in skill_keywords:
sum_skills[skill] = individual_skill_counts_rule_based[skill].sum()
sum_skills_rule_based = pd.DataFrame(sum_skills, index=['Number of jobs'])
# Show the number of jobs that require each specific skill, from the rule-based approach
rearranged_sum_skills_rule_based = sum_skills_rule_based.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_skills_rule_based['%Percentage (1,004 jobs)'] = (rearranged_sum_skills_rule_based['Number of jobs']*100/len(individual_skill_counts_rule_based)).round(2).astype(str) + '%'
rearranged_sum_skills_rule_based.head(10)
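# The percentage column above is rebuilt the same way for every table below;
# a small helper like this (an illustrative sketch, not used elsewhere in this
# notebook) would avoid repeating the expression:
def add_percentage_column(df, total_jobs):
    # Percentage of the total jobs that require each skill, formatted as a string
    df['%Percentage (1,004 jobs)'] = (df['Number of jobs'] * 100 / total_jobs).round(2).astype(str) + '%'
    return df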
"""**Bar chart**: Display the distribution of required skills in 1,004 Data Analyst jobs (Rule-based approach)"""
plt.figure(figsize=(25, 6))
plt.xticks(rotation=90, fontsize=10)
for i, value in enumerate(rearranged_sum_skills_rule_based['Number of jobs']):
plt.text(i, value + 1, str(value), ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_skills_rule_based.index, rearranged_sum_skills_rule_based['Number of jobs'], align='center')
plt.xlabel('Required Skills')
plt.ylabel('Number of jobs')
plt.title('Distribution of Required Skills in Data Analyst jobs (Rule-based approach)')
plt.show()
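# The same bar-chart recipe repeats for every skill table in this notebook;
# a reusable helper like this (an illustrative sketch, not called below) captures it:
def plot_skill_counts(df, title, figsize=(25, 6)):
    plt.figure(figsize=figsize)
    plt.xticks(rotation=90, fontsize=10)
    # Annotate each bar with its job count
    for i, value in enumerate(df['Number of jobs']):
        plt.text(i, value + 1, str(value), ha='center', va='bottom', fontsize=9)
    plt.bar(df.index, df['Number of jobs'], align='center')
    plt.xlabel('Required Skills')
    plt.ylabel('Number of jobs')
    plt.title(title)
    plt.show()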
"""**Word cloud**: Display the top required skills for Data Analyst jobs (Rule-based approach)"""
sum_individual_skills_dict = dict(zip(rearranged_sum_skills_rule_based.index, rearranged_sum_skills_rule_based['Number of jobs']))
sum_individual_skills_wordcloud_rule_based = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(sum_individual_skills_dict)
plt.figure(figsize=(10, 5))
plt.imshow(sum_individual_skills_wordcloud_rule_based, interpolation='bilinear')
plt.axis('off')
plt.title('Top Required Skills for Data Analyst jobs (Rule-based approach)')
plt.show()
"""## Word Embeddings approach"""
# Train Word2Vec model
# word2vec_model = Word2Vec(sentences=data['Job Description'], vector_size=300, window=2, min_count=1, workers=4)
model = api.load("word2vec-google-news-300")
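# Quick illustrative sanity check of the pretrained vectors (the example words
# are assumptions; exact scores depend on the word2vec-google-news-300 model):
print(model.similarity('python', 'java'))    # cosine similarity of two related terms
print(model.most_similar('python', topn=3))  # nearest neighbours in the embedding space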
def get_vector_unique_corpus(uni_bi_grams, model):
"""Function to build the dictionary of unique words and get the vector of each word."""
# Iterate to get the vector of each unique word.
corpus_dict = {}
for grams_list in uni_bi_grams['JD_uni_bi_grams']:
for gram in grams_list:
word = " ".join(gram)
if word not in corpus_dict:
if word in model.key_to_index:
corpus_dict[word] = model[word]
return corpus_dict
# Call the function to get the vector of the unique words
unique_corpus = get_vector_unique_corpus(jd_uni_bi_grams, model)
unique_corpus
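# Each value in unique_corpus is the 300-dimensional vector of one in-vocabulary
# word. Note that bigrams joined with a space are generally not in the pretrained
# vocabulary (Google News phrases use underscores), so the corpus is mostly unigrams.
# A quick illustrative shape check:
sample_word = next(iter(unique_corpus))
print(sample_word, unique_corpus[sample_word].shape)  # -> (300,)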
# Initialize the dataframe to count the required skills
individual_skill_counts_word_embeddings = individual_skill_counts_df.copy()
def count_skills_word_embeddings(job_descriptions, skill_keywords, model, skill_counts_df):
"""Function to detect skills from the job descriptions using Rule-based approach"""
# Start the timer
start = time.time()
# Iterate over the skill keyword and detect the keyword from job descriptions
lemmatizer = WordNetLemmatizer()
for i, words in enumerate(job_descriptions['JD_uni_bi_grams']):
for word in words:
for skill in skill_keywords.keys():
converted_skill = lemmatizer.lemmatize(skill.lower())
if converted_skill in model.key_to_index and word in model.key_to_index:
sim_score = model.similarity(word, converted_skill)
if sim_score > 0.5:
skill_counts_df.loc[i, skill] = 1
if (i + 1) % 5 == 0 or i == len(job_descriptions) - 1:
print(f"Done {i + 1} jobs")
# Stop the timer & Display the total execution time
end = time.time()
execution_time = end - start
print(f"{execution_time} seconds to complete the word embeddings approach with {len(job_descriptions)} jobs.")
return skill_counts_df
# Call the function to count the required skills using word embeddings approach
individual_skill_counts_word_embeddings = count_skills_word_embeddings(jd_uni_bi_grams, skill_keywords, model, individual_skill_counts_word_embeddings)
# Save to .csv file
individual_skill_counts_word_embeddings.to_csv('/content/individual_skill_counts_word_embeddings.csv', index=False)
files.download('/content/individual_skill_counts_word_embeddings.csv')
# Check the dataset
individual_skill_counts_word_embeddings.head(10)
# Get the sum of required skills for each individual skill from word embeddings approach
sum_skills = {}
for skill in skill_keywords:
sum_skills[skill] = individual_skill_counts_word_embeddings[skill].sum()
sum_skills_word_embeddings = pd.DataFrame(sum_skills, index=['Number of jobs'])
# Show the number of jobs that require each specific skill, from the word embeddings approach
rearranged_sum_skills_word_embeddings = sum_skills_word_embeddings.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_skills_word_embeddings['%Percentage (1,004 jobs)'] = (rearranged_sum_skills_word_embeddings['Number of jobs']*100/len(individual_skill_counts_word_embeddings)).round(2).astype(str) + '%'
rearranged_sum_skills_word_embeddings.head(10)
"""**Bar chart**: Display the distribution of required skills in 1,004 Data Analyst jobs (Word embeddings approach)"""
plt.figure(figsize=(25, 6))
plt.xticks(rotation=90, fontsize=10)
for i, value in enumerate(rearranged_sum_skills_word_embeddings['Number of jobs']):
plt.text(i, value + 1, str(value), ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_skills_word_embeddings.index, rearranged_sum_skills_word_embeddings['Number of jobs'], align='center')
plt.xlabel('Required Skills')
plt.ylabel('Number of jobs')
plt.title('Distribution of Required Skills in Data Analyst jobs (Word Embeddings approach)')
plt.show()
"""**Word cloud**: Display the top required skills for Data Analyst jobs (Word embeddings approach)"""
sum_individual_skills_dict_word_embeddings = dict(zip(rearranged_sum_skills_word_embeddings.index, rearranged_sum_skills_word_embeddings['Number of jobs']))
sum_individual_skills_wordcloud_word_embeddings = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(sum_individual_skills_dict_word_embeddings)
plt.figure(figsize=(10, 5))
plt.imshow(sum_individual_skills_wordcloud_word_embeddings, interpolation='bilinear')
plt.axis('off')
plt.title('Top Required Skills for Data Analyst jobs (Word Embeddings approach)')
plt.show()
"""## Evaluation for selecting the better approach"""
# Import the labeled dataset (200 jobs) that I created by manually reading the job descriptions to identify the required skills
ground_truth = pd.read_csv("/content/drive/My Drive/Colab Notebooks/labeled_dataset_200.csv")
ground_truth.head(10)
def evaluation(ground_truth, detected_skills):
"""Function to evaluate the performance of the approach"""
result = {}
    accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []
for skill in ground_truth.columns[5:]:
# Extract true and predicted labels for the current skill
ground_truth_skill = ground_truth[skill]
detected_skill = detected_skills[skill]
# Compute evaluation metrics for the current skill
accuracy = accuracy_score(ground_truth_skill, detected_skill)
precision = precision_score(ground_truth_skill, detected_skill)
recall = recall_score(ground_truth_skill, detected_skill)
f1 = f1_score(ground_truth_skill, detected_skill)
# Append scores to the corresponding lists
accuracy_scores.append(accuracy)
precision_scores.append(precision)
recall_scores.append(recall)
f1_scores.append(f1)
    # Calculate the average scores for each metric (the built-in round works on plain floats)
    average_accuracy = str(round(sum(accuracy_scores) * 100 / len(accuracy_scores), 2)) + '%'
    average_precision = str(round(sum(precision_scores) * 100 / len(precision_scores), 2)) + '%'
    average_recall = str(round(sum(recall_scores) * 100 / len(recall_scores), 2)) + '%'
    average_f1 = str(round(sum(f1_scores) * 100 / len(f1_scores), 2)) + '%'
# Put the average score into the result dictionary
result["Accuracy"] = average_accuracy
result["Precision"] = average_precision
result["Recall"] = average_recall
result["F1 scores"] = average_f1
return result
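# Illustrative check of the per-skill metrics on toy labels (made-up values):
# ground truth [1, 0, 1, 1] vs. predictions [1, 0, 0, 1]
# -> accuracy 0.75, precision 1.0, recall ~0.67, F1 0.8
y_true, y_pred = [1, 0, 1, 1], [1, 0, 0, 1]
print(accuracy_score(y_true, y_pred), precision_score(y_true, y_pred),
      recall_score(y_true, y_pred), f1_score(y_true, y_pred))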
# Call the function to evaluate the approaches
result_rule_based = evaluation(ground_truth, individual_skill_counts_rule_based[:200])
result_word_embeddings = evaluation(ground_truth, individual_skill_counts_word_embeddings[:200])
# Print to show the performance of each approach
print(f"Result of Rule-based approach\n{result_rule_based}\n")
print(f"Result of Word embeddings approach\n{result_word_embeddings}")
"""## Summary the performance of both approaches
From the results above, it's evident that both approaches perform equally well in detecting skills from job descriptions. However, the Word Embeddings approach demonstrates significantly faster execution, taking only 213 seconds compared to the Rule-based approach, which requires 3899 seconds.
Therefore, this case we should use Word Embeddings approach.
# 4) Data Intepretation
### 4.1 Overall required skills by each category
"""
skill_categories = {
'Programming': ['Python', 'SQL', 'R', 'VBA', 'C', 'C++', 'C#', 'Java', 'JavaScript', 'HTML', 'Ruby'],
'Database Management System': ['RDBMS', 'NoSQL'],
'Statistics for Data Analysis': ['Excel', 'Statistics', 'Probability', 'Hypothesis Testing',
'A/B Testing', 'MATLAB', 'Time Series', 'Pandas', 'NumPy'],
'Data Visualization': ['Tableau', 'PowerBI', 'Looker', 'QlikView', 'MicroStrategy',
                       'Plotly', 'Matplotlib', 'Seaborn', 'Excel'],
'Machine Learning': ['Regression', 'Classification', 'Clustering', 'Predictive Modeling', 'Tensorflow',
'Pytorch', 'Scikit-Learn'],
'Big Data Technologies': ['Hadoop', 'Spark', 'Hive', 'Databricks', 'Snowflake', 'ETL'],
'Enterprise system': ['SAP', 'SCM', 'CRM', 'ERP', 'SAAS', 'PeopleSoft', 'Oracle', 'Sharepoint'],
'Communication': ['Presentation', 'Reporting', 'Verbal', 'Written', 'Word', 'PowerPoint'],
'Employee attributes': ['Teamwork', 'Critical-thinking', 'Time management', 'Project management', 'Agile',
'Problem-solving', 'Detail-oriented', 'Motivation', 'Adaptability', 'Good attitude']
}
def sum_skills_categories(skill_categories, skill_keywords, individual_skill_counts):
# Initialize the dictionary to store the sum of required skills for each category
sum_categories = {}
for category in skill_categories:
sum_categories[category] = 0
    # Iterate over the jobs; for each job, flag every matching category at most once
    for i in range(len(individual_skill_counts)):
        count = [0] * len(skill_categories)
for skill in skill_keywords:
if all(x == 1 for x in count):
break
if individual_skill_counts.loc[i, skill] != 1:
continue
for j, category in enumerate(skill_categories):
if (skill in skill_categories[category]) and (count[j] == 0):
count[j] = 1
continue
for k, category in enumerate(skill_categories):
sum_categories[category] += count[k]
return sum_categories
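# Illustrative trace on a one-row fake table (made-up values): a job requiring
# both 'Python' and 'SQL' flags the 'Programming' category only once, so the
# totals count jobs, not skill mentions.
demo_row = {skill: 0 for skill in skill_keywords}
demo_row.update({'Python': 1, 'SQL': 1})
demo_counts = pd.DataFrame([demo_row])
print(sum_skills_categories(skill_categories, skill_keywords, demo_counts)['Programming'])  # -> 1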
sum_categories = sum_skills_categories(skill_categories, skill_keywords, individual_skill_counts_word_embeddings)
# Create the dataframe of the sum of required skills by categories
sum_categories_df = pd.DataFrame(sum_categories, index=['Number of jobs'])
# Show the number of jobs that require skills in each category
rearranged_sum_categories_df = sum_categories_df.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_categories_df['%Percentage (1,004 jobs)'] = (rearranged_sum_categories_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
# Check the dataset
rearranged_sum_categories_df
"""**Bar chart**: Display the number of jobs require skills in each category by descending order"""
plt.figure(figsize=(10, 3))
plt.xticks(rotation=45, ha='right', fontsize=12)
for i, value in enumerate(rearranged_sum_categories_df['Number of jobs']):
plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=10)
plt.bar(rearranged_sum_categories_df.index, rearranged_sum_categories_df['Number of jobs'], align='center')
plt.xlabel('Required Skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 1100)
plt.title('Distribution of Required Skills by each category', fontsize=12)
plt.show()
"""### 4.2 Technical & Soft skills
#### 4.2.1 Technical skills
- Programming
- Database Management System
- Statistics for Data Analysis
- Data Visualization
- Machine Learning
- Big Data Technologies
- Enterprise system
"""
technical_skills_categories = {'Programming',
'Database Management System',
'Statistics for Data Analysis',
'Data Visualization',
'Machine Learning',
'Big Data Technologies',
'Enterprise system'}
# Get the dataframe containing the number of jobs that require technical skills, by individual skill
technical_skills = set()
for key, value in skill_categories.items():
if key in technical_skills_categories:
for skill in value:
technical_skills.add(skill)
sum_technical_skills = {}
for key, value in sum_skills.items():
if key in technical_skills:
sum_technical_skills[key] = value
sum_technical_skills_df = pd.DataFrame(sum_technical_skills, index=['Number of jobs'])
# Get the dataframe containing the number of jobs that require technical skills, by category
sum_technical_skills_categories = {}
for key, value in sum_categories.items():
if key in technical_skills_categories:
sum_technical_skills_categories[key] = value
sum_technical_skills_categories_df = pd.DataFrame(sum_technical_skills_categories, index=['Number of jobs'])
"""##### By individual skills"""
# Transpose and sort the dataframe into a better visual form
rearranged_sum_technical_skills_df = sum_technical_skills_df.T.sort_values(by='Number of jobs', ascending=False)
# Add a column showing the percentage of the total jobs that require each skill
rearranged_sum_technical_skills_df['%Percentage (1,004 jobs)'] = (rearranged_sum_technical_skills_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_technical_skills_df.head(10)
"""**Bar chart**: Display the distribution of required technical skills for Data Analyst jobs by descending order"""
plt.figure(figsize=(25, 6))
plt.xticks(rotation=90, fontsize=10)
for i, value in enumerate(rearranged_sum_technical_skills_df['Number of jobs']):
plt.text(i, value + 1, str(value), ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_technical_skills_df.index, rearranged_sum_technical_skills_df['Number of jobs'], align='center')
plt.xlabel('Required Technical skills')
plt.ylabel('Number of jobs')
plt.title('Distribution of Required Technical Skills for Data Analyst jobs')
plt.show()
"""**Word cloud**: Display the top required technical skills for Data Analyst jobs"""
sum_individual_technical_skills_dict = dict(zip(rearranged_sum_technical_skills_df.index, rearranged_sum_technical_skills_df['Number of jobs']))
sum_individual_technical_skills_wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(sum_individual_technical_skills_dict)
plt.figure(figsize=(10, 5))
plt.imshow(sum_individual_technical_skills_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Top Required Technical Skills for Data Analyst jobs')
plt.show()
"""##### By categories"""
# Transpose and sort the dataframe into a better visual form
rearranged_sum_technical_skills_categories_df = sum_technical_skills_categories_df.T.sort_values(by='Number of jobs', ascending=False)
# Add a column showing the percentage of the total jobs that require each skill
rearranged_sum_technical_skills_categories_df['%Percentage (1,004 jobs)'] = (rearranged_sum_technical_skills_categories_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_technical_skills_categories_df.head(10)
"""**Bar chart**: Display the number of jobs require technical skills by categories"""
plt.figure(figsize=(10, 3))
plt.xticks(rotation=45, ha='right', fontsize=12)
for i, value in enumerate(rearranged_sum_technical_skills_categories_df['Number of jobs']):
plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=10)
plt.bar(rearranged_sum_technical_skills_categories_df.index, rearranged_sum_technical_skills_categories_df['Number of jobs'], align='center')
plt.xlabel('Required Technical skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 1100)
plt.title('Distribution of Required Technical Skills for Data Analyst jobs by categories', fontsize=12)
plt.show()
"""#### 4.2.2 Soft skills
- Communication
- Employee attributes
##### By individual skills
"""
soft_skills_categories = {'Communication', 'Employee attributes'}
# Get the dataframe containing the number of jobs that require soft skills, by individual skill
soft_skills = set()
for key, value in skill_categories.items():
if key in soft_skills_categories:
for skill in value:
soft_skills.add(skill)
sum_soft_skills = {}
for key, value in sum_skills.items():
if key in soft_skills:
sum_soft_skills[key] = value
sum_soft_skills_df = pd.DataFrame(sum_soft_skills, index=['Number of jobs'])
# Get the dataframe containing the number of jobs that require soft skills, by category
sum_soft_skills_categories = {}
for key, value in sum_categories.items():
if key in soft_skills_categories:
sum_soft_skills_categories[key] = value
sum_soft_skills_categories_df = pd.DataFrame(sum_soft_skills_categories, index=['Number of jobs'])
# Transpose and sort the dataframe into a better visual form
rearranged_sum_soft_skills_df = sum_soft_skills_df.T.sort_values(by='Number of jobs', ascending=False)
# Add a column showing the percentage of the total jobs that require each skill
rearranged_sum_soft_skills_df['%Percentage (1,004 jobs)'] = (rearranged_sum_soft_skills_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_soft_skills_df.head(10)
"""**Bar chart**: Display the distribution of required soft skills for Data Analyst jobs by descending order"""
plt.figure(figsize=(25, 6))
plt.xticks(rotation=90, fontsize=12)
for i, value in enumerate(rearranged_sum_soft_skills_df['Number of jobs']):
plt.text(i, value + 1, str(value), ha='center', va='bottom', fontsize=12)
plt.bar(rearranged_sum_soft_skills_df.index, rearranged_sum_soft_skills_df['Number of jobs'], align='center')
plt.xlabel('Required Soft skills')
plt.ylabel('Number of jobs')
plt.title('Distribution of Required Soft Skills for Data Analyst jobs')
plt.show()
"""**Word cloud**: Display the volumn of each soft skill that is require for Data Analyst jobs"""
sum_individual_soft_skills_dict = dict(zip(rearranged_sum_soft_skills_df.index, rearranged_sum_soft_skills_df['Number of jobs']))
sum_individual_soft_skills_wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(sum_individual_soft_skills_dict)
plt.figure(figsize=(10, 5))
plt.imshow(sum_individual_soft_skills_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Top Required Soft Skills for Data Analyst Jobs')
plt.show()
"""##### By categories"""
# Transpose and sort the dataframe into a better visual form
rearranged_sum_soft_skills_categories_df = sum_soft_skills_categories_df.T.sort_values(by='Number of jobs', ascending=False)
# Add a column showing the percentage of the total jobs that require each skill
rearranged_sum_soft_skills_categories_df['%Percentage (1,004 jobs)'] = (rearranged_sum_soft_skills_categories_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_soft_skills_categories_df.head(10)
"""**Bar chart**: Display the distribution of required soft skills for data analyst jobs by categories"""
plt.figure(figsize=(10, 3))
for i, value in enumerate(rearranged_sum_soft_skills_categories_df['Number of jobs']):
plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=10)
plt.bar(rearranged_sum_soft_skills_categories_df.index, rearranged_sum_soft_skills_categories_df['Number of jobs'], align='center')
plt.xlabel('Required Soft skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 1100)
plt.title('Distribution of Required Soft Skills for Data Analyst Jobs by categories', fontsize=12)
plt.show()
"""### 4.3 Programming category"""
# Get the dataframe containing the number of jobs that require programming skills
sum_programming_skills = {}
for skill in skill_categories['Programming']:
sum_programming_skills[skill] = sum_skills[skill]
sum_programming_skills_df = pd.DataFrame(sum_programming_skills, index=['Number of jobs'])
rearranged_sum_programming_skills_df = sum_programming_skills_df.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_programming_skills_df['%Percentage (1,004 jobs)'] = (rearranged_sum_programming_skills_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_programming_skills_df
"""**Bar chart**: Display the distribution of required programming skills by each language"""
plt.figure(figsize=(10, 3))
for i, value in enumerate(rearranged_sum_programming_skills_df['Number of jobs']):
    plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_programming_skills_df.index, rearranged_sum_programming_skills_df['Number of jobs'], align='center')
plt.xlabel('Programming skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 600)
plt.title('Distribution of Required Programming Skills for Data Analyst Jobs', fontsize=12)
plt.show()
"""### 4.4 Database Management System category"""
# Get the dataframe containing the number of jobs that require database management system skills
sum_dbms_skills = {}
for skill in skill_categories['Database Management System']:
sum_dbms_skills[skill] = sum_skills[skill]
sum_dbms_skills_df = pd.DataFrame(sum_dbms_skills, index=['Number of jobs'])
rearranged_sum_dbms_skills_df = sum_dbms_skills_df.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_dbms_skills_df['%Percentage (1,004 jobs)'] = (rearranged_sum_dbms_skills_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_dbms_skills_df
"""**Bar chart**: Display the distribution of required Database Management System skills"""
plt.figure(figsize=(10, 3))
for i, value in enumerate(rearranged_sum_dbms_skills_df['Number of jobs']):
plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_dbms_skills_df.index, rearranged_sum_dbms_skills_df['Number of jobs'], align='center')
plt.xlabel('Database Management System skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 600)
plt.title('Distribution of Required Database Management System Skills for Data Analyst Jobs', fontsize=12)
plt.show()
"""### 4.5 Statistics for Data Analysis category"""
# Get the dataframe containing the number of jobs that require statistics for data analysis skills
sum_stat_skills = {}
for skill in skill_categories['Statistics for Data Analysis']:
sum_stat_skills[skill] = sum_skills[skill]
sum_stat_skills_df = pd.DataFrame(sum_stat_skills, index=['Number of jobs'])
rearranged_sum_stat_skills_df = sum_stat_skills_df.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_stat_skills_df['%Percentage (1,004 jobs)'] = (rearranged_sum_stat_skills_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_stat_skills_df
"""**Bar chart**: Display the distribution of required Statistics for Data Analysis skills"""
plt.figure(figsize=(10, 3))
plt.xticks(rotation=45, ha='right')
for i, value in enumerate(rearranged_sum_stat_skills_df['Number of jobs']):
plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_stat_skills_df.index, rearranged_sum_stat_skills_df['Number of jobs'], align='center')
plt.xlabel('Statistics for Data Analysis skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 600)
plt.title('Distribution of Required Statistics for Data Analysis Skills for Data Analyst Jobs', fontsize=12)
plt.show()
"""### 4.6 Data Visualization category
"""
# Get the dataframe containing the number of jobs that require data visualization skills
sum_datavis_skills = {}
for skill in skill_categories['Data Visualization']:
sum_datavis_skills[skill] = sum_skills[skill]
sum_datavis_skills_df = pd.DataFrame(sum_datavis_skills, index=['Number of jobs'])
rearranged_sum_datavis_skills_df = sum_datavis_skills_df.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_datavis_skills_df['%Percentage (1,004 jobs)'] = (rearranged_sum_datavis_skills_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_datavis_skills_df
"""**Bar chart**: Display the distribution of required Data Visualization skills"""
plt.figure(figsize=(10, 3))
plt.xticks(rotation=45, ha='right')
for i, value in enumerate(rearranged_sum_datavis_skills_df['Number of jobs']):
plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_datavis_skills_df.index, rearranged_sum_datavis_skills_df['Number of jobs'], align='center')
plt.xlabel('Data visualization skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 600)
plt.title('Distribution of Required Data Visualization Skills for Data Analyst Jobs', fontsize=12)
plt.show()
"""### 4.7 Machine Learning category"""
# Get the dataframe containing the number of jobs that require machine learning skills
sum_ml_skills = {}
for skill in skill_categories['Machine Learning']:
sum_ml_skills[skill] = sum_skills[skill]
sum_ml_skills_df = pd.DataFrame(sum_ml_skills, index=['Number of jobs'])
rearranged_sum_ml_skills_df = sum_ml_skills_df.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_ml_skills_df['%Percentage (1,004 jobs)'] = (rearranged_sum_ml_skills_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_ml_skills_df
"""**Bar chart**: Display the distribution of required Machine Learning skills"""
plt.figure(figsize=(10, 3))
plt.xticks(rotation=45, ha='right')
for i, value in enumerate(rearranged_sum_ml_skills_df['Number of jobs']):
plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_ml_skills_df.index, rearranged_sum_ml_skills_df['Number of jobs'], align='center')
plt.xlabel('Machine learning skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 70)
plt.title('Distribution of Required Machine Learning Skills for Data Analyst Jobs', fontsize=12)
plt.show()
"""### 4.8 Big Data Technologies category"""
# Get the dataframe containing the number of jobs that require big data technologies skills
sum_bigdata_skills = {}
for skill in skill_categories['Big Data Technologies']:
sum_bigdata_skills[skill] = sum_skills[skill]
sum_bigdata_skills_df = pd.DataFrame(sum_bigdata_skills, index=['Number of jobs'])
rearranged_sum_bigdata_skills_df = sum_bigdata_skills_df.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_bigdata_skills_df['%Percentage (1,004 jobs)'] = (rearranged_sum_bigdata_skills_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_bigdata_skills_df
"""**Bar chart**: Display the distribution of required Big Data Technologies skills"""
plt.figure(figsize=(10, 3))
for i, value in enumerate(rearranged_sum_bigdata_skills_df['Number of jobs']):
plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_bigdata_skills_df.index, rearranged_sum_bigdata_skills_df['Number of jobs'], align='center')
plt.xlabel('Big data technologies skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 150)
plt.title('Distribution of Required Big Data Technologies Skills for Data Analyst Jobs', fontsize=12)
plt.show()
"""### 4.9 Enterprise system category"""
# Get the dataframe containing the number of jobs that require enterprise system skills
sum_entsys_skills = {}
for skill in skill_categories['Enterprise system']:
sum_entsys_skills[skill] = sum_skills[skill]
sum_entsys_skills_df = pd.DataFrame(sum_entsys_skills, index=['Number of jobs'])
rearranged_sum_entsys_skills_df = sum_entsys_skills_df.T.sort_values(by='Number of jobs', ascending=False)
rearranged_sum_entsys_skills_df['%Percentage (1,004 jobs)'] = (rearranged_sum_entsys_skills_df['Number of jobs']*100/len(individual_skill_counts_df)).round(2).astype(str) + '%'
rearranged_sum_entsys_skills_df
"""**Bar chart**: Display the distribution of required Enterprise system skills"""
plt.figure(figsize=(10, 3))
for i, value in enumerate(rearranged_sum_entsys_skills_df['Number of jobs']):
plt.text(i, value + 1, f"{value}\n({value*100/len(individual_skill_counts_df):.2f} %)", ha='center', va='bottom', fontsize=9)
plt.bar(rearranged_sum_entsys_skills_df.index, rearranged_sum_entsys_skills_df['Number of jobs'], align='center')
plt.xlabel('Enterprise system skills', fontsize=12)
plt.ylabel('Number of jobs', fontsize=12)
plt.ylim(0, 100)
plt.title('Distribution of Required Enterprise System Skills for Data Analyst Jobs', fontsize=12)
plt.show()
"""### 4.10 Communication category"""
# Get the dataframe containing the number of jobs that require communication skills