End to End Machine Learning Project on Student Performance Dataset

Visualizations
An End to End ML project

Installing Libraries

Code
# %%capture
# #!pip install squarify geopandas plotly Ipython ipywidget dash

Importing Libraries

::: {.cell _cell_guid=‘b1076dfc-b9ad-4769-8c92-a6c4dae69d19’ _uuid=‘8f2839f25d086af736a60e9eeb907d3b93b6e0e5’ tags=‘[]’ execution_count=1}

Code
import random
import squarify
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('white')
import geopandas as gpd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
from IPython.display import HTML
from ipywidgets import interact, Dropdown
# from googlesearch import search

:::

Importing the Dataset and Processing a Bit

Code
# Load the tracks table; the first CSV column is used as the row index.
# When running on Kaggle, the same file is available at
# "/kaggle/input/artists-locations-spotify-cleaned/spotify_with_locations.csv".
csv_path = "./files/spotify_with_locations.csv"
data = pd.read_csv(csv_path, index_col=0)
data.shape
(225456, 28)
Code
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 225456 entries, 0 to 225455
Data columns (total 28 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      225456 non-null  float64
 1   artists           225444 non-null  object 
 2   danceability      225456 non-null  float64
 3   duration_ms       225456 non-null  int64  
 4   energy            225456 non-null  float64
 5   explicit          225456 non-null  int64  
 6   id                225456 non-null  object 
 7   instrumentalness  225456 non-null  float64
 8   key               225456 non-null  int64  
 9   liveness          225456 non-null  float64
 10  loudness          225456 non-null  float64
 11  mode              225456 non-null  int64  
 12  name              225456 non-null  object 
 13  popularity        225456 non-null  int64  
 14  release_date      225456 non-null  object 
 15  speechiness       225456 non-null  float64
 16  tempo             225456 non-null  float64
 17  valence           225456 non-null  float64
 18  year              225456 non-null  int64  
 19  mbid              181952 non-null  object 
 20  artist_lastfm     177791 non-null  object 
 21  country_mb        171605 non-null  object 
 22  country_lastfm    132978 non-null  object 
 23  tags_mb           142502 non-null  object 
 24  tags_lastfm       160841 non-null  object 
 25  listeners_lastfm  177791 non-null  float64
 26  scrobbles_lastfm  177791 non-null  float64
 27  ambiguous_artist  181952 non-null  object 
dtypes: float64(11), int64(6), object(11)
memory usage: 49.9+ MB
Code
# How much of the data is null
data.isnull().sum()
acousticness            0
artists                12
danceability            0
duration_ms             0
energy                  0
explicit                0
id                      0
instrumentalness        0
key                     0
liveness                0
loudness                0
mode                    0
name                    0
popularity              0
release_date            0
speechiness             0
tempo                   0
valence                 0
year                    0
mbid                43504
artist_lastfm       47665
country_mb          53851
country_lastfm      92478
tags_mb             82954
tags_lastfm         64615
listeners_lastfm    47665
scrobbles_lastfm    47665
ambiguous_artist    43504
dtype: int64
Code
# Discard the columns that the rest of the notebook never uses, then give the
# two remaining metadata columns friendlier names.
unused_columns = [
    'mbid', 'country_lastfm', 'tags_mb', 'listeners_lastfm',
    'scrobbles_lastfm', 'id', 'ambiguous_artist', 'artist_lastfm',
]
data = data.drop(columns=unused_columns)
data = data.rename(columns={'country_mb': 'artist_origin', 'tags_lastfm': 'genres'})
Code
# Summarise missing values per column: absolute count and share of all rows.
missing_counts = data.isnull().sum(axis=0)
df_missing = pd.DataFrame({
    'column_name': missing_counts.index,
    'missing_count': missing_counts.values,
})
df_missing['missing_ratio'] = df_missing['missing_count'] / len(data)
# Display only the columns that actually contain gaps, worst first.
df_missing[df_missing['missing_ratio'] > 0].sort_values(by='missing_ratio', ascending=False)
column_name missing_count missing_ratio
19 genres 64615 0.286597
18 artist_origin 53851 0.238854
1 artists 12 0.000053
Code
# Keep only the rows in which the artist, its origin and its genre tags are all present.
required_columns = ['artist_origin', 'artists', 'genres']
df_cleaned = data.dropna(subset=required_columns, axis=0)
# Sanity check: the per-column missing counts of the cleaned frame.
df_cleaned.isna().sum(axis=0)
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
artist_origin       0
genres              0
dtype: int64
Code
# Creating a copy of data to work on
df = df_cleaned.copy()
df.shape
(156512, 20)
Code
# Restrict the analysis to songs released after `min_year` (exclusive bound).
min_year = 1979
df = df[df['year'] > min_year].copy()

# Express the track length in seconds and discard the millisecond column.
df['duration'] = df['duration_ms'] / 1000
df = df.drop(columns='duration_ms')
df.shape
(73887, 20)
Code
df.describe()
acousticness danceability energy explicit instrumentalness key liveness loudness mode popularity speechiness tempo valence year duration
count 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000
mean 0.270194 0.572947 0.623204 0.162004 0.094045 5.261115 0.200284 -8.982739 0.683733 47.005820 0.088800 120.083159 0.520840 1999.056816 247.873261
std 0.304551 0.178310 0.244244 0.368457 0.241324 3.562242 0.175176 5.129790 0.465022 12.571747 0.101303 30.003903 0.254652 11.332604 88.925993
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -60.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1980.000000 30.080000
25% 0.020500 0.456000 0.462500 0.000000 0.000000 2.000000 0.093100 -11.216000 0.000000 38.000000 0.034500 96.013500 0.318000 1989.000000 200.095500
50% 0.132000 0.587000 0.661000 0.000000 0.000024 5.000000 0.130000 -7.688000 1.000000 46.000000 0.046600 117.956000 0.526000 1999.000000 236.253000
75% 0.458000 0.705000 0.822000 0.000000 0.006290 8.000000 0.258000 -5.440000 1.000000 55.000000 0.090500 139.557000 0.729000 2009.000000 279.933000
max 0.996000 0.986000 1.000000 1.000000 0.999000 11.000000 0.998000 1.342000 1.000000 100.000000 0.960000 244.091000 0.998000 2020.000000 3816.373000
Code
# Target dtypes, grouped by kind, chosen to shrink the frame's memory footprint.
float_columns = [
    'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
    'loudness', 'popularity', 'speechiness', 'tempo', 'valence', 'duration',
]
flag_columns = ['explicit', 'mode']
text_columns = ['artists', 'name', 'release_date', 'genres', 'artist_origin']

dtypes = {
    **dict.fromkeys(float_columns, 'float32'),
    **dict.fromkeys(flag_columns, 'bool'),
    'key': 'uint8',
    'year': 'uint16',
    **dict.fromkeys(text_columns, 'string'),
}
# Cast every listed column in a single pass.
df = df.astype(dtypes)
Code
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 73887 entries, 5920 to 225454
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      73887 non-null  float32
 1   artists           73887 non-null  string 
 2   danceability      73887 non-null  float32
 3   energy            73887 non-null  float32
 4   explicit          73887 non-null  bool   
 5   instrumentalness  73887 non-null  float32
 6   key               73887 non-null  uint8  
 7   liveness          73887 non-null  float32
 8   loudness          73887 non-null  float32
 9   mode              73887 non-null  bool   
 10  name              73887 non-null  string 
 11  popularity        73887 non-null  float32
 12  release_date      73887 non-null  string 
 13  speechiness       73887 non-null  float32
 14  tempo             73887 non-null  float32
 15  valence           73887 non-null  float32
 16  year              73887 non-null  uint16 
 17  artist_origin     73887 non-null  string 
 18  genres            73887 non-null  string 
 19  duration          73887 non-null  float32
dtypes: bool(2), float32(11), string(5), uint16(1), uint8(1)
memory usage: 6.8 MB
Code
# df.to_csv("PowerBI_Spotify_Cleaned_data",index = 0)
Code
# grouped_df = df.groupby("name", sort=False).apply(lambda x: x.reset_index(drop=True))
# grouped_df[grouped_df['artists'] == 'Sade']
  • From this we can see that the same song can be released in several years, possibly by different producers or on different albums.
  • We therefore group each feature by song and take its mean value before plotting.

Plot 1: Distribution of Artist Origins across years

Context

  • Which country do most artists come from, for a given year or across all years?

Caution

  • For any year, all countries that account for less than 1% of the total origins are grouped together into an "Other" category
Code
def treemap(year):
    """Draw and save a treemap of artist-origin shares for one year or all years.

    Args:
        year: A value of the ``year`` column of the global ``df``, or the
            string ``'All'`` to use every row.

    Origins holding less than 1% of the selected rows are merged into a single
    ``'Other'`` tile. The figure is saved under the plot title and then shown.
    Nothing is drawn when the selection contains no rows.
    """
    if year != 'All':
        data = df.query('year == @year')
        title = f'Artist Origins Distribution for {year} - Treemap (Thresholded)'
    else:
        data = df
        min_year = np.min(df['year'])
        max_year = np.max(df['year'])
        title = f'Artist Origins Distribution from {min_year} to {max_year} - Treemap (Thresholded)'

    # An empty selection has nothing to lay out.
    if data.empty:
        print(f'No songs found for year {year!r}.')
        return

    # Origins below 1% of the selected rows are folded into one bucket
    threshold = 0.01 * data.shape[0]

    # Number of rows per artist origin, most frequent first
    artist_counts = data['artist_origin'].value_counts()

    # Split the counts into the small origins and the ones shown individually
    other_countries = artist_counts[artist_counts < threshold]
    other_count = other_countries.sum()
    filtered_counts = artist_counts.drop(other_countries.index)

    # Only add the bucket when something falls into it: squarify documents its
    # sizes as positive values, so a zero-sized tile must not be passed in.
    if other_count > 0:
        filtered_counts['Other'] = other_count

    # Denominator for the percentage labels
    total_size = filtered_counts.sum()

    # Create the treemap
    plt.figure(figsize=(10, 6))
    squarify.plot(sizes=filtered_counts, label=filtered_counts.index, alpha=0.8, color=sns.color_palette("muted"))
    plt.axis('off')

    # Add a percentage label below each tile's name; tiles are drawn in the
    # same order as the entries of `filtered_counts`.
    ax = plt.gca()
    for patch, (label, size) in zip(ax.patches, filtered_counts.items()):
        x, y, dx, dy = patch.get_bbox().bounds
        percentage = (size / total_size) * 100
        ax.text(x + dx / 2, y + dy / 2, f'\n\n{percentage:.1f}%', ha='center', va='center')

    plt.title(title)
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()

# Dropdown options: 'All' first (the default), then the years in ascending
# order so a specific year is easy to find.
years = ['All'] + sorted(df['year'].unique().tolist())

# Create the interactive plot
interact(treemap, year=years)
plt.show()

Plot 1 Alternative

Code
# Number of rows in `df` per artist origin (one entry per country name)
artist_counts = df['artist_origin'].value_counts()

# Create the choropleth map using plotly.graph_objects
fig = go.Figure(data=go.Choropleth(
    locations=artist_counts.index,
    z=artist_counts.values,
    locationmode='country names',  # locations are matched by country name
    colorscale='ylorrd',
    reversescale=False,  # Keep the colorscale in its default (non-reversed) direction
))

# Derive the year range for the title from the data instead of hard-coding it,
# so the title stays correct if the year filter applied to `df` changes.
first_year = df['year'].min()
last_year = df['year'].max()

# Set the layout properties
fig.update_layout(
    title_text=f'Artist Origins Count by Country from {first_year} to {last_year}',
    height = 1200,
    width = 1500,
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
)

# Show the figure
fig.show()
plt.show()

Plot 3 High Feature Valued Songs distribution across Years and Origins classified wrt Genres

Context

  • Identify the top 5 songs with the highest value of a particular feature, for any country and time frame
  • We can also see which genres have the highest feature values across countries and time frames

Caution:

  • The feature values are obtained by aggregation, since the same song can be published several times, possibly in different playlists
  • For a few countries we don’t get a full top 5, since not enough data is available
Code
def create_plot(year, country, feature, top_n=5):
    """Plot and save a bar chart of the songs with the highest mean ``feature``.

    Args:
        year: A value of the ``year`` column of the global ``df``, or ``'All'``.
        country: A value of the ``artist_origin`` column, or ``'All'``.
        feature: Name of the numeric column of ``df`` to rank the songs by.
        top_n: Maximum number of songs to show; fewer bars are drawn when the
            selection contains fewer distinct song names.

    Rows sharing a song name and genre string are averaged, one row is kept per
    song name, and the ``top_n`` highest means are drawn as bars coloured by
    the song's first two genres. The figure is saved under the plot title and
    then shown. Nothing is drawn when the selection contains no rows.
    """
    # Keep only the columns this plot needs, then narrow by country and year.
    filtered_data = df.loc[:, ['name', feature, 'artist_origin', 'year', 'genres']]
    if country != 'All':
        filtered_data = filtered_data[filtered_data['artist_origin'] == country]
    if year != 'All':
        filtered_data = filtered_data[filtered_data['year'] == year]

    # Some country/year combinations have no songs; stop here instead of
    # failing on the bar-width division further down.
    if filtered_data.empty:
        print(f'No songs found for country {country!r} and year {year!r}.')
        return

    feature_label = feature.capitalize()
    if year != 'All':
        if country != 'All':
            title = f'Top {feature_label} {country} Songs in Year {year}'
        else:
            title = f'Top {feature_label} Songs from all countries in Year {year}'
    else:
        min_year = np.min(filtered_data.year)
        max_year = np.max(filtered_data.year)
        origin_label = country if country != 'All' else 'all countries'
        title = f'Top {feature_label} Songs from {origin_label} across {min_year} - {max_year}'

    # Mean of the feature per (song name, genre string) pair
    grouped = filtered_data.groupby(['name', 'genres'])[feature].mean().reset_index()

    # Only taking unique song names
    grouped = grouped.drop_duplicates(subset='name')

    # Highest means first, limited to the requested number of songs
    sorted_data = grouped.sort_values(by=feature, ascending=False).head(top_n).copy()

    # Only using first 2 genres for songs
    sorted_data['genres'] = sorted_data['genres'].apply(lambda x: ';'.join(x.split(';')[0:2]))

    # Origin of each song, taken from the current selection (first occurrence)
    # so that a same-named song from elsewhere cannot leak into the label.
    origin_by_song = (
        filtered_data.drop_duplicates(subset='name').set_index('name')['artist_origin']
    )

    # Create the bar plot using Seaborn
    plt.figure(figsize=(12, 8))

    # Calculate the bar width
    bar_width = 3 / sorted_data.shape[0]

    ax = sns.barplot(
        data=sorted_data,
        x='name',
        y=feature,
        palette='muted',
        hue='genres',
        width=bar_width  # Set the bar width
    )

    # Annotate each real bar. With `hue`, the order of `ax.patches` does not
    # follow the row order of `sorted_data`, so the song is recovered from the
    # bar's x position: categories sit at 0, 1, 2, ... and dodged bars stay
    # within half a unit of their category centre.
    song_names = sorted_data['name'].tolist()
    for bar in ax.patches:
        height = bar.get_height()
        # Skip patches that are not data bars (zero width or undefined height).
        if bar.get_width() == 0 or not np.isfinite(height):
            continue
        x_center = bar.get_x() + bar.get_width() / 2
        slot = int(round(x_center))
        if not 0 <= slot < len(song_names):
            continue
        mean_value = sorted_data[feature].iloc[slot]
        if country == 'All':
            origin = origin_by_song[song_names[slot]]
            ax.text(
                x_center,
                height / 2,
                f'{mean_value:.2f} in {origin}',
                ha='center',
                va='center',
                rotation=90
            )
        else:
            ax.text(
                x_center,
                height / 2,
                f'{mean_value:.2f}',
                ha='center',
                va='center'
            )

    # Customize the x-axis ticks
    xticks = np.arange(len(sorted_data))
    plt.xticks(xticks, sorted_data['name'], rotation=36)
    plt.gca().xaxis.set_tick_params(width=1)  # Set the tick width

    plt.xlabel('Name of Songs')
    plt.ylabel(f'{feature.capitalize()}')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()


# Dropdown options. 'All' is kept as the first entry of both lists so that it
# is the default selection; sorting it together with the country names would
# bury it in the middle of the list.
years = ['All'] + sorted(df['year'].unique().tolist())
countries = ['All'] + sorted(df['artist_origin'].unique().tolist())
features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
tops = [5,10,15]
# Removed Explicit: Whether or not the track has explicit lyrics (true = yes it does; false = no it does not OR unknown); Not that important
# Removed Year as it's not a feature for a song

# Create the interactive plot
interact(create_plot, year=years,country=countries, feature = features, top_n=tops)
plt.show()

Plot 4: Distribution of Features across countries

  • Answers questions such as which countries have the most danceable songs, the loudest songs, and so on, in a given time frame.
Code
import functools

# Features offered in the dropdown (defined once for the whole cell)
features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']


@functools.lru_cache(maxsize=1)
def _load_world():
    """Return the 'naturalearth_lowres' layer, reading it from disk only once."""
    # NOTE(review): `gpd.datasets` is reported as deprecated in recent
    # geopandas releases -- confirm the installed version still provides it.
    return gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))


# Define a function to update the plot based on the selected feature
def update_plot(feature):
    """Show a world choropleth of the per-country mean of ``feature``.

    Args:
        feature: Name of a numeric column of the global ``df``.
    """
    # Group the data by artist origin and calculate the mean of the selected feature per country
    feature_means = df.groupby('artist_origin')[feature].mean()

    # The shapefile does not change between calls, so it is loaded through the
    # cached helper instead of being re-read on every dropdown change.
    world = _load_world()

    # Merge on the country name. This is an inner join, so origins whose
    # spelling differs from the layer's `name` column are left off the map.
    merged_data = world.merge(feature_means, left_on='name', right_index=True)

    # Create a choropleth map using plotly
    # NOTE(review): some rows of this layer reportedly carry a placeholder
    # `iso_a3` value; such countries would not be coloured -- confirm.
    fig = go.Figure(data=go.Choropleth(
        locations=merged_data['iso_a3'],
        z=merged_data[feature],
        text=merged_data['name'],
        colorscale='Viridis',
        autocolorscale=False,
        marker_line_color='white',
        marker_line_width=0.5,
        colorbar_title=f'{feature.capitalize()}'
    ))

    # Customize the layout
    fig.update_layout(
        title_text=f'{feature.capitalize()} Distribution by Country',
        height = 1200,
        width = 1500,
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='equirectangular'
        )
    )

    # Show the plot
    fig.show()

# Use interact to create the interactive dropdown widget
interact(update_plot, feature=Dropdown(options=features))
plt.show()

Rest Good Plots

Code
# Distribution of each feature's values, stacked by year. The cell below draws
# the same distributions with one overlaid histogram per year.

# Define the list of features
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
            'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
            'tempo', 'valence']

# Function to create the interactive histogram plot
def plot_histogram(feature):
    """Show a histogram of ``feature`` over the global ``df``, stacked by year.

    Args:
        feature: Name of the column of ``df`` to plot on the x-axis.
    """
    # Create the histogram plot using seaborn
    plt.figure(figsize=(10, 6))
    ax = sns.histplot(data=df, x=feature, bins=30, kde=True, hue='year', multiple='stack', palette='viridis')
    plt.title(f'{feature.capitalize()} Distribution by Year')
    plt.xlabel('Mean Value')
    plt.ylabel('Frequency')

    # Retitle the legend seaborn already built for `hue`. Calling
    # `plt.legend(...)` here would build a new legend from labelled artists,
    # and the histogram artists carry no labels, so the year entries would be
    # lost.
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Year')
    plt.show()

# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()
Code
# Per-year histograms of a feature, overlaid on a single axes.

# Features offered in the dropdown
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
            'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
            'tempo', 'valence']


def plot_histogram(feature):
    """Overlay one histogram of ``feature`` per year of the global ``df``.

    Args:
        feature: Name of the column of ``df`` to plot on the x-axis.
    """
    # A single wide axes holds the histograms of all years
    fig, ax = plt.subplots(figsize=(24, 9), dpi=120)

    # One histogram (with KDE curve) per year, labelled so it appears in the legend
    for year_value, year_rows in df.groupby('year'):
        sns.histplot(data=year_rows, x=feature, bins=30, kde=True, ax=ax, label=str(year_value))

    # Title and axis labels
    ax.set(
        title=f'{feature.capitalize()} Distribution by Year',
        xlabel='Mean Value',
        ylabel='Frequency',
    )

    # Legend laid out in 8 columns, anchored at the axes' upper-right corner
    ax.legend(title='Year', bbox_to_anchor=(1, 1), ncol=8)

    # Shrink the axes area to leave room on the right
    fig.subplots_adjust(right=0.85)

    plt.show()


# Hook the function up to a dropdown of features
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()
Code
# Yearly mean of a feature, drawn as a bar chart.

# Features offered in the dropdown
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
            'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
            'tempo', 'valence']

# Number of features in the dropdown
num_colors = len(features)

# Bar colours to pick from
desired_colors = ['tab:purple', 'tab:orange', 'tab:green', 'tab:blue']


def plot_histogram(feature):
    """Show a bar chart of the mean of ``feature`` for every year of the global ``df``.

    Args:
        feature: Name of the numeric column of ``df`` to average per year.

    The bar colour is drawn at random from ``desired_colors`` on every call.
    """
    # Mean of the selected feature for each year
    yearly_means = df.groupby('year')[feature].mean()

    # One colour for all bars, chosen at random
    bar_color = random.choice(desired_colors)

    fig, ax = plt.subplots(1, 1, figsize=(20, 9), dpi=120)

    # Draw the bars on the axes that was just created
    ax = sns.barplot(x=yearly_means.index, y=yearly_means, color=bar_color, ax=ax)

    # Title and axis labels
    ax.set_title(f'Mean {feature.capitalize()} by Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Mean Value')

    # Vertical year labels, pushed slightly away from the axis
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.xaxis.set_tick_params(pad=10)

    plt.tight_layout()
    plt.show()


# Hook the function up to a dropdown of features
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()

Rough Plots

Code
# ac = df.copy()
# # ac[['genre_1', 'genre_2', 'genre_3','rest_genres']] = ac.genres.str.split(';', n=3, expand=True)
Code
# ac.isnull().sum()
Code
# import geopandas as gpd
# import pandas as pd
# import plotly.graph_objects as go
# from ipywidgets import interact, Dropdown

# # Read the world shapefile
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# # Read the data containing country and genre information
# data = ac

# # Define the function to update the plot based on the selected year
# def update_plot(year):
#     # Filter the data for the selected year
#     filtered_data = data[data['year'] == year]

#     # Calculate the genre distribution by country
#     genre_distribution = filtered_data.groupby('artist_origin')['genres'].value_counts().unstack().reset_index().fillna(0)

#     # Merge the data with the world shapefile based on the country names
#     merged_data = world.merge(genre_distribution, left_on='name', right_on='artist_origin')

#     # Create a stacked bar plot using plotly
#     fig = go.Figure()

#     for genre in genre_distribution.columns[1:]:
#         fig.add_trace(go.Choropleth(
#             locations=merged_data['iso_a3'],
#             z=merged_data[genre],
#             text=merged_data['name'],
#             colorscale='Viridis',
#             autocolorscale=False,
#             marker_line_color='white',
#             marker_line_width=0.5,
#             colorbar_title='Count',
#             name=genre
#         ))

#     # Customize the layout
#     fig.update_layout(
#         title_text=f'Genre Distribution by Country in {year}',
#         height=1200,
#         width=1500,
#         barmode='stack',
#         geo=dict(
#             showframe=False,
#             showcoastlines=False,
#             projection_type='equirectangular'
#         )
#     )

#     # Show the plot
#     fig.show()

# # Get the unique years from the data
# years = data['year'].unique().tolist()

# # Use interact to create the interactive dropdown widget
# interact(update_plot, year=Dropdown(options=years))
# plt.show()