End to End Machine Learning Project on Student Performance Dataset

Visualizations

An End to End ML project

Installing Libraries

Code

# %%capture
# #!pip install squarify geopandas plotly Ipython ipywidget dash

Importing Libraries

::: {.cell _cell_guid=‘b1076dfc-b9ad-4769-8c92-a6c4dae69d19’ _uuid=‘8f2839f25d086af736a60e9eeb907d3b93b6e0e5’ tags=‘[]’ execution_count=1}

Code

import random
import squarify
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('white')
import geopandas as gpd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
from IPython.display import HTML
from ipywidgets import interact, Dropdown
# from googlesearch import search

:::

Importing the Dataset and Processing a Bit

Code

# Load the Spotify tracks dataset; the first CSV column becomes the DataFrame index.
# Kaggle input path, kept for reference:
# data = pd.read_csv("/kaggle/input/artists-locations-spotify-cleaned/spotify_with_locations.csv", index_col=0)
data = pd.read_csv("./files/spotify_with_locations.csv", index_col=0)
# Display (rows, columns) of the raw data.
data.shape

(225456, 28)

Code

# Summarise column dtypes, non-null counts and memory usage of the raw data.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225456 entries, 0 to 225455
Data columns (total 28 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      225456 non-null  float64
 1   artists           225444 non-null  object 
 2   danceability      225456 non-null  float64
 3   duration_ms       225456 non-null  int64  
 4   energy            225456 non-null  float64
 5   explicit          225456 non-null  int64  
 6   id                225456 non-null  object 
 7   instrumentalness  225456 non-null  float64
 8   key               225456 non-null  int64  
 9   liveness          225456 non-null  float64
 10  loudness          225456 non-null  float64
 11  mode              225456 non-null  int64  
 12  name              225456 non-null  object 
 13  popularity        225456 non-null  int64  
 14  release_date      225456 non-null  object 
 15  speechiness       225456 non-null  float64
 16  tempo             225456 non-null  float64
 17  valence           225456 non-null  float64
 18  year              225456 non-null  int64  
 19  mbid              181952 non-null  object 
 20  artist_lastfm     177791 non-null  object 
 21  country_mb        171605 non-null  object 
 22  country_lastfm    132978 non-null  object 
 23  tags_mb           142502 non-null  object 
 24  tags_lastfm       160841 non-null  object 
 25  listeners_lastfm  177791 non-null  float64
 26  scrobbles_lastfm  177791 non-null  float64
 27  ambiguous_artist  181952 non-null  object 
dtypes: float64(11), int64(6), object(11)
memory usage: 49.9+ MB

Code

# Count the missing (null) values in every column of the raw data.
data.isnull().sum()

acousticness            0
artists                12
danceability            0
duration_ms             0
energy                  0
explicit                0
id                      0
instrumentalness        0
key                     0
liveness                0
loudness                0
mode                    0
name                    0
popularity              0
release_date            0
speechiness             0
tempo                   0
valence                 0
year                    0
mbid                43504
artist_lastfm       47665
country_mb          53851
country_lastfm      92478
tags_mb             82954
tags_lastfm         64615
listeners_lastfm    47665
scrobbles_lastfm    47665
ambiguous_artist    43504
dtype: int64

Code

# Discard the columns the analysis does not use, then give the two remaining
# MusicBrainz / Last.fm columns more descriptive names.
data = data.drop(
    columns=[
        'mbid',
        'country_lastfm',
        'tags_mb',
        'listeners_lastfm',
        'scrobbles_lastfm',
        'id',
        'ambiguous_artist',
        'artist_lastfm',
    ]
)
data = data.rename(columns={'country_mb': 'artist_origin', 'tags_lastfm': 'genres'})

Code

# Report, per column, how many values are missing and what share of all rows
# that represents. Only columns with at least one missing value are shown,
# worst first. This cell only reports; it does not drop anything.
df_missing = (
    data.isnull()
    .sum(axis=0)
    .rename_axis('column_name')
    .reset_index(name='missing_count')
)
df_missing['missing_ratio'] = df_missing['missing_count'] / len(data)
df_missing[df_missing['missing_ratio'] > 0].sort_values(by='missing_ratio', ascending=False)

	column_name	missing_count	missing_ratio
19	genres	64615	0.286597
18	artist_origin	53851	0.238854
1	artists	12	0.000053

Code

# Keep only the rows where artist, artist origin and genres are all present,
# then confirm that no null values remain in any column.
df_cleaned = data.dropna(
    axis=0,
    how='any',
    subset=['artist_origin', 'artists', 'genres'],
)
df_cleaned.isnull().sum()

acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
artist_origin       0
genres              0
dtype: int64

Code

# Work on an independent copy so that df_cleaned itself is not modified below.
df = df_cleaned.copy()
# Display (rows, columns) of the working copy.
df.shape

(156512, 20)

Code

# Restrict the analysis to tracks released after `min_year` (1980 onwards).
min_year = 1979
df = df.loc[df['year'] > min_year].copy()

# Store the track length in seconds rather than milliseconds: the new
# `duration` column is appended last and the millisecond column is removed.
df['duration'] = df['duration_ms'] / 1000
df = df.drop(columns='duration_ms')
df.shape

(73887, 20)

Code

# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

	acousticness	danceability	energy	explicit	instrumentalness	key	liveness	loudness	mode	popularity	speechiness	tempo	valence	year	duration
count	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000	73887.000000
mean	0.270194	0.572947	0.623204	0.162004	0.094045	5.261115	0.200284	-8.982739	0.683733	47.005820	0.088800	120.083159	0.520840	1999.056816	247.873261
std	0.304551	0.178310	0.244244	0.368457	0.241324	3.562242	0.175176	5.129790	0.465022	12.571747	0.101303	30.003903	0.254652	11.332604	88.925993
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-60.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1980.000000	30.080000
25%	0.020500	0.456000	0.462500	0.000000	0.000000	2.000000	0.093100	-11.216000	0.000000	38.000000	0.034500	96.013500	0.318000	1989.000000	200.095500
50%	0.132000	0.587000	0.661000	0.000000	0.000024	5.000000	0.130000	-7.688000	1.000000	46.000000	0.046600	117.956000	0.526000	1999.000000	236.253000
75%	0.458000	0.705000	0.822000	0.000000	0.006290	8.000000	0.258000	-5.440000	1.000000	55.000000	0.090500	139.557000	0.729000	2009.000000	279.933000
max	0.996000	0.986000	1.000000	1.000000	0.999000	11.000000	0.998000	1.342000	1.000000	100.000000	0.960000	244.091000	0.998000	2020.000000	3816.373000

Code

# Target dtype for every column, grouped by dtype. Narrower numeric types and
# the dedicated string dtype are used for more efficient processing.
dtypes = {
    **dict.fromkeys(
        [
            'acousticness', 'danceability', 'energy', 'instrumentalness',
            'liveness', 'loudness', 'popularity', 'speechiness', 'tempo',
            'valence', 'duration',
        ],
        'float32',
    ),
    **dict.fromkeys(['explicit', 'mode'], 'bool'),
    'key': 'uint8',
    'year': 'uint16',
    **dict.fromkeys(
        ['artists', 'name', 'release_date', 'genres', 'artist_origin'],
        'string',
    ),
}
df = df.astype(dtypes)

Code

# Confirm the converted dtypes and the resulting memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73887 entries, 5920 to 225454
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      73887 non-null  float32
 1   artists           73887 non-null  string 
 2   danceability      73887 non-null  float32
 3   energy            73887 non-null  float32
 4   explicit          73887 non-null  bool   
 5   instrumentalness  73887 non-null  float32
 6   key               73887 non-null  uint8  
 7   liveness          73887 non-null  float32
 8   loudness          73887 non-null  float32
 9   mode              73887 non-null  bool   
 10  name              73887 non-null  string 
 11  popularity        73887 non-null  float32
 12  release_date      73887 non-null  string 
 13  speechiness       73887 non-null  float32
 14  tempo             73887 non-null  float32
 15  valence           73887 non-null  float32
 16  year              73887 non-null  uint16 
 17  artist_origin     73887 non-null  string 
 18  genres            73887 non-null  string 
 19  duration          73887 non-null  float32
dtypes: bool(2), float32(11), string(5), uint16(1), uint8(1)
memory usage: 6.8 MB

Code

# df.to_csv("PowerBI_Spotify_Cleaned_data",index = 0)

Code

# grouped_df = df.groupby("name", sort=False).apply(lambda x: x.reset_index(drop=True))
# grouped_df[grouped_df['artists'] == 'Sade']

From this we can see that the same song can be released in multiple years, perhaps by different producers or on different albums.
So we will group each feature by song and take the mean value of the feature before plotting.

Plot 1: Distribution of Artist Origins across years

Context

Which country do most artists come from, for a given year or across all years?

Caution

For any year, all countries that account for less than 1% of the total origins are grouped together into an "Other" category.

Code

def treemap(year):
    """Draw a treemap of artist-origin shares for one year or for all years.

    Countries that account for less than 1% of the selected rows are merged
    into a single 'Other' tile. The figure is also saved to disk under the
    plot title.

    Args:
        year: A year present in ``df['year']``, or the string ``'All'`` to
            use every row of ``df``.
    """
    if year != 'All':
        subset = df.query('year == @year')
        title = f'Artist Origins Distribution for {year} - Treemap (Thresholded)'
    else:
        subset = df
        min_year = np.min(df['year'])
        max_year = np.max(df['year'])
        title = f'Artist Origins Distribution from {min_year} to {max_year} - Treemap (Thresholded)'

    # Nothing to draw (only possible when called with a year not in the data).
    if subset.empty:
        print(f'No data available for year {year}.')
        return

    # Number of rows per country of origin.
    artist_counts = subset['artist_origin'].value_counts()

    # Countries below 1% of the selected rows are folded into 'Other'.
    threshold = 0.01 * subset.shape[0]
    other_countries = artist_counts[artist_counts < threshold]
    filtered_counts = artist_counts.drop(other_countries.index)

    # Only add the 'Other' tile when it is non-empty: squarify divides by the
    # tile sizes while laying out rectangles and cannot handle a zero size.
    other_count = other_countries.sum()
    if other_count > 0:
        filtered_counts['Other'] = other_count

    total_size = filtered_counts.sum()

    # Create the treemap.
    plt.figure(figsize=(10, 6))
    squarify.plot(sizes=filtered_counts, label=filtered_counts.index, alpha=0.8, color=sns.color_palette("muted"))
    plt.axis('off')

    # Write the percentage share under each country label. Patches are drawn
    # in the same order as `filtered_counts`, so the two are paired by zip.
    axes = plt.gca()
    for patch, size in zip(axes.patches, filtered_counts):
        x, y, dx, dy = patch.get_bbox().bounds
        percentage = (size / total_size) * 100
        axes.text(x + dx / 2, y + dy / 2, f'\n\n{percentage:.1f}%', ha='center', va='center')

    plt.title(title)
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()

# Dropdown choices: 'All' first, then every year present in the data in
# chronological order (unique() alone returns them in row order).
years = ['All'] + sorted(df['year'].unique().tolist())

# Create the interactive plot: interact() builds a dropdown from `years` and
# calls treemap() with the selected value.
interact(treemap, year=years)
plt.show()

Plot 1 Alternative

Code

# Number of rows (tracks) per artist origin over the whole filtered dataset.
artist_counts = df['artist_origin'].value_counts()

# World choropleth coloured by that count; countries are matched by name and
# the yellow-orange-red scale is used in its normal (non-reversed) direction.
fig = go.Figure()
fig.add_trace(
    go.Choropleth(
        locations=artist_counts.index,
        z=artist_counts.values,
        locationmode='country names',
        colorscale='ylorrd',
        reversescale=False,
    )
)

# Figure size, title and a frameless equirectangular map without coastlines.
fig.update_layout(
    title_text='Artist Origins Count by Country from 1980 to 2020',
    height=1200,
    width=1500,
    geo={
        'showframe': False,
        'showcoastlines': False,
        'projection_type': 'equirectangular',
    },
)

# Render the plotly figure (the trailing plt.show() is kept from the original cell).
fig.show()
plt.show()

Plot 2: Most Popular Artists for a Given Year and Place of Origin

Context of Plot

For a given year what are the origins of most popular artists
Who are the most popular artists in a particular year and place of origin

Caution Points

For a few countries there is no data available in a particular year; e.g., Taiwan's data seems to be available only for a few years.
Another point is that in some years a particular country has only a few artists, so we may get only the top 2 or top 3 artists, and in the worst case only a single artist or no artist at all from that country.
Also, because of digitization, I think newer artists appear most popular: the new generation has the means to share their likes, which favours new artists, and they do not listen to older artists as much.

Code

def create_plot(year, country):
    """Bar-plot the (up to) five artists with the highest mean popularity.

    Rows of ``df`` are filtered by year and/or country, popularity is averaged
    per artist, and the top five artists are plotted. The figure is also saved
    to disk under the plot title.

    Args:
        year: A year present in ``df['year']``, or ``'All'`` for every year.
        country: A value of ``df['artist_origin']``, or ``'All'`` for every
            country. With ``'All'`` each bar is annotated with the artist's
            origin.
    """
    # Apply the two filters independently; 'All' means "no filter".
    filtered_data = df
    if year != 'All':
        filtered_data = filtered_data[filtered_data['year'] == year]
    if country != 'All':
        filtered_data = filtered_data[filtered_data['artist_origin'] == country]

    # Some year/country combinations have no rows at all; without this guard
    # the y-tick computation below fails on int(nan).
    if filtered_data.empty:
        print(f'No data available for year={year}, country={country}.')
        return

    if year != 'All':
        if country != 'All':
            title = f'Top {country} Artists in Year {year}'
        else:
            title = f'Top Artists from all countries in Year {year}'
    else:
        min_year = np.min(filtered_data.year)
        max_year = np.max(filtered_data.year)
        origin_text = country if country != 'All' else 'all countries'
        title = f'Top Artists from {origin_text} across {min_year} - {max_year}'

    # Mean popularity per artist, highest first. head(5) simply returns fewer
    # rows when fewer than five artists are available.
    grouped = filtered_data.groupby('artists')['popularity'].mean().reset_index()
    sorted_data = grouped.sort_values(by='popularity', ascending=False).head(5)

    # Create the bar plot using Seaborn.
    plt.figure(figsize=(12, 8))
    sns.barplot(data=sorted_data, x='artists', y='popularity', palette='muted')
    plt.title(title)
    plt.xlabel('Artists')
    plt.ylabel('Popularity')
    plt.xticks(rotation=45)
    # y-axis ticks with a fixed increment of 5, reaching past the tallest bar.
    plt.yticks(range(0, int(sorted_data['popularity'].max()) + 6, 5))

    # Annotate each bar with the artist's origin when all countries are shown.
    if country == 'All':
        origin_by_artist = (
            filtered_data.drop_duplicates(subset='artists')
            .set_index('artists')['artist_origin']
        )
        axes = plt.gca()
        # zip stops at the shorter sequence, so extra patches cannot cause an
        # out-of-range lookup into sorted_data.
        for bar, artist_name in zip(axes.patches, sorted_data['artists']):
            axes.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() / 2,
                origin_by_artist[artist_name],
                ha='center',
                va='center',
                rotation=90,
            )

    plt.tight_layout()
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()

# Dropdown choices. 'All' is kept as the first entry of both lists so it is
# the default selection; sorting it together with the country names would
# place it after countries such as 'Albania'.
years = ['All'] + sorted(df['year'].unique().tolist())
countries = ['All'] + sorted(df['artist_origin'].unique().tolist())

# Create the interactive plot: one dropdown per argument of create_plot().
interact(create_plot, year=years, country=countries)
plt.show()

Plot 3: Songs with the Highest Feature Values across Years and Origins, Coloured by Genre

Context

Identify the top 5 songs with the highest value of a particular feature, across countries and time frames.
We can also identify which genres have the highest feature values across years and time frames.

Caution:

The feature values are obtained by aggregation, since a song can be published multiple times, perhaps in different playlists.
For a few countries we do not get a full top 5, since not enough data is available.

Code

def create_plot(year, country, feature, top_n=5):
    """Bar-plot the songs with the highest mean value of ``feature``.

    Rows of ``df`` are filtered by year and/or country, ``feature`` is averaged
    per (song name, genres) pair, one row per song name is kept, and the
    ``top_n`` highest songs are plotted with bars coloured by their first two
    genres. The figure is also saved to disk under the plot title.

    Args:
        year: A year present in ``df['year']``, or ``'All'`` for every year.
        country: A value of ``df['artist_origin']``, or ``'All'`` for every
            country. With ``'All'`` each bar label also names the origin.
        feature: Name of the numeric column of ``df`` to rank songs by.
        top_n: Maximum number of songs to show.
    """
    # Work only with the columns this plot needs; 'All' means "no filter".
    filtered_data = df.loc[:, ['name', feature, 'artist_origin', 'year', 'genres']]
    if year != 'All':
        filtered_data = filtered_data[filtered_data['year'] == year]
    if country != 'All':
        filtered_data = filtered_data[filtered_data['artist_origin'] == country]

    # Some year/country combinations have no rows; bail out instead of
    # dividing by zero when computing the bar width.
    if filtered_data.empty:
        print(f'No songs available for year={year}, country={country}.')
        return

    feature_label = feature.capitalize()
    if year != 'All':
        if country != 'All':
            title = f'Top {feature_label} {country} Songs in Year {year}'
        else:
            title = f'Top {feature_label} Songs from all countries in Year {year}'
    else:
        min_year = np.min(filtered_data.year)
        max_year = np.max(filtered_data.year)
        origin_text = country if country != 'All' else 'all countries'
        title = f'Top {feature_label} Songs from {origin_text} across {min_year} - {max_year}'

    # Mean feature value per (song, genres) pair, then one row per song name.
    grouped = filtered_data.groupby(['name', 'genres'])[feature].mean().reset_index()
    grouped = grouped.drop_duplicates(subset='name')

    # Highest values first; head() returns fewer rows when fewer songs exist.
    # copy() so the genre column can be rewritten without touching `grouped`.
    sorted_data = grouped.sort_values(by=feature, ascending=False).head(top_n).copy()

    # Only the first two genres of each song are used for the colour legend.
    sorted_data['genres'] = sorted_data['genres'].apply(lambda x: ';'.join(x.split(';')[0:2]))

    # Create the bar plot using Seaborn.
    plt.figure(figsize=(12, 8))

    # Narrower bars when more songs are shown, capped so that bars never
    # overlap their neighbours when only one or two songs are available.
    bar_width = min(0.8, 3 / sorted_data.shape[0])

    # dodge=False keeps every bar centred on its own x position (0, 1, 2, ...)
    # in row order, so the text labels below can be placed from the data
    # itself instead of relying on the order of the drawn patches.
    axes = sns.barplot(
        data=sorted_data,
        x='name',
        y=feature,
        palette='muted',
        hue='genres',
        width=bar_width,
        dodge=False,
    )

    # Origin of each song taken from the filtered rows (first occurrence), so
    # a same-named song from another year or country cannot be picked up.
    origin_by_song = (
        filtered_data.drop_duplicates(subset='name')
        .set_index('name')['artist_origin']
    )

    # Label each bar at mid-height with its value (and origin for 'All').
    for position, (song_name, mean_value) in enumerate(
        zip(sorted_data['name'], sorted_data[feature])
    ):
        if country == 'All':
            axes.text(
                position,
                mean_value / 2,
                f'{mean_value:.2f} in {origin_by_song[song_name]}',
                ha='center',
                va='center',
                rotation=90,
            )
        else:
            axes.text(
                position,
                mean_value / 2,
                f'{mean_value:.2f}',
                ha='center',
                va='center',
            )

    # One tick per song, labelled with the song name.
    xticks = np.arange(len(sorted_data))
    plt.xticks(xticks, sorted_data['name'], rotation=36)
    axes.xaxis.set_tick_params(width=1)

    plt.xlabel('Name of Songs')
    plt.ylabel(f'{feature_label}')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()


# Dropdown choices. 'All' is kept as the first entry of the year and country
# lists so it is the default selection; sorting it together with the country
# names would place it after countries such as 'Albania'.
years = ['All'] + sorted(df['year'].unique().tolist())
countries = ['All'] + sorted(df['artist_origin'].unique().tolist())
features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
tops = [5,10,15]
# 'explicit' is left out: it only flags whether a track has explicit lyrics
# (false also covers "unknown"), which is not that important here.
# 'year' is left out because it is not a feature of a song.

# Create the interactive plot: one dropdown per argument of create_plot().
interact(create_plot, year=years,country=countries, feature = features, top_n=tops)
plt.show()

Plot 4: Distribution of Features across countries

This plot can answer questions such as which countries have the most danceable songs, the loudest songs, and so on, in a given time frame.

Code

# Numeric song features that can be shown on the map.
features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']


def update_plot(feature):
    """Show a world choropleth of the mean of ``feature`` per artist origin.

    Countries are matched by name through plotly's ``'country names'``
    location mode, the same approach the artist-count choropleth in this
    notebook uses. This avoids loading the Natural Earth shapefile through
    ``gpd.datasets`` (removed in GeoPandas 1.0) on every widget event, and
    avoids the inner merge on shapefile names, which dropped every origin
    whose spelling differed from the shapefile's.

    Args:
        feature: Name of the numeric column of ``df`` to average per country.
    """
    # Mean of the selected feature for every artist origin.
    feature_means = df.groupby('artist_origin')[feature].mean()

    # Create a choropleth map using plotly.
    fig = go.Figure(data=go.Choropleth(
        locations=feature_means.index,
        z=feature_means.values,
        locationmode='country names',
        text=feature_means.index,
        colorscale='Viridis',
        autocolorscale=False,
        marker_line_color='white',
        marker_line_width=0.5,
        colorbar_title=f'{feature.capitalize()}'
    ))

    # Customize the layout.
    fig.update_layout(
        title_text=f'{feature.capitalize()} Distribution by Country',
        height = 1200,
        width = 1500,
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='equirectangular'
        )
    )

    # Show the plot.
    fig.show()


# Use interact to create the interactive dropdown widget.
interact(update_plot, feature=Dropdown(options=features))
plt.show()

Rest Good Plots

Code

# Distribution of each feature, stacked by release year. A more detailed
# per-year version of the same plot follows in the next cell.

# Define the list of features
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
            'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
            'tempo', 'valence']


def plot_histogram(feature):
    """Plot a 30-bin histogram of ``feature`` with the bars stacked by year.

    Args:
        feature: Name of the column of ``df`` to plot.
    """
    plt.figure(figsize=(10, 6))
    axes = sns.histplot(data=df, x=feature, bins=30, kde=True, hue='year', multiple='stack', palette='viridis')
    plt.title(f'{feature.capitalize()} Distribution by Year')
    plt.xlabel('Mean Value')
    plt.ylabel('Frequency')

    # Retitle the legend seaborn already created for `hue`. Calling
    # plt.legend() here would build a new legend from labelled artists, find
    # none, and replace the hue legend with an empty one.
    hue_legend = axes.get_legend()
    if hue_legend is not None:
        hue_legend.set_title('Year')
    plt.show()


# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()

Code

# Features offered in the dropdown below.
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
            'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
            'tempo', 'valence']


def plot_histogram(feature):
    """Overlay one 30-bin histogram (with KDE) of ``feature`` per year.

    All years are drawn on a single wide axes; each year gets its own legend
    entry.
    """
    fig, ax = plt.subplots(figsize=(24, 9), dpi=120)

    # One histogram per release year, all on the same axes.
    for year_value, year_rows in df.groupby('year'):
        sns.histplot(
            data=year_rows,
            x=feature,
            bins=30,
            kde=True,
            ax=ax,
            label=str(year_value),
        )

    # Title and axis labels.
    ax.set(
        title=f'{feature.capitalize()} Distribution by Year',
        xlabel='Mean Value',
        ylabel='Frequency',
    )

    # Eight-column legend anchored at the upper-right corner of the axes.
    ax.legend(title='Year', bbox_to_anchor=(1, 1), ncol=8)

    # Shrink the axes horizontally to leave room for the legend.
    fig.subplots_adjust(right=0.85)

    plt.show()


# Dropdown widget that re-draws the plot for the selected feature.
interact(
    plot_histogram,
    feature=Dropdown(options=features, description='Select Feature'),
)
plt.show()

Code

# Features offered in the dropdown below.
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
            'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
            'tempo', 'valence']

# Number of features (not used by the plot itself).
num_colors = len(features)

# Bar colours the plot picks from at random.
desired_colors = ['tab:purple', 'tab:orange', 'tab:green', 'tab:blue']


def plot_histogram(feature):
    """Bar-plot the yearly mean of ``feature`` in a randomly chosen colour."""
    # One colour per call, drawn from the fixed list above.
    color = random.choice(desired_colors)

    # Mean of the selected feature for every release year.
    feature_means = df.groupby('year')[feature].mean()

    fig, ax = plt.subplots(1, 1, figsize=(20, 9), dpi=120)

    # Draw the bars on the axes that was just created.
    ax = sns.barplot(x=feature_means.index, y=feature_means, color=color, ax=ax)

    # Title and axis labels.
    ax.set_title(f'Mean {feature.capitalize()} by Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Mean Value')

    # Vertical year labels with extra padding from the axis.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.xaxis.set_tick_params(pad=10)

    plt.tight_layout()
    plt.show()


# Dropdown widget that re-draws the plot for the selected feature.
interact(
    plot_histogram,
    feature=Dropdown(options=features, description='Select Feature'),
)
plt.show()

Rough Plots

Code

# ac = df.copy()
# # ac[['genre_1', 'genre_2', 'genre_3','rest_genres']] = ac.genres.str.split(';', n=3, expand=True)

Code

# ac.isnull().sum()

Code

# import geopandas as gpd
# import pandas as pd
# import plotly.graph_objects as go
# from ipywidgets import interact, Dropdown

# # Read the world shapefile
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# # Read the data containing country and genre information
# data = ac

# # Define the function to update the plot based on the selected year
# def update_plot(year):
#     # Filter the data for the selected year
#     filtered_data = data[data['year'] == year]

#     # Calculate the genre distribution by country
#     genre_distribution = filtered_data.groupby('artist_origin')['genres'].value_counts().unstack().reset_index().fillna(0)

#     # Merge the data with the world shapefile based on the country names
#     merged_data = world.merge(genre_distribution, left_on='name', right_on='artist_origin')

#     # Create a stacked bar plot using plotly
#     fig = go.Figure()

#     for genre in genre_distribution.columns[1:]:
#         fig.add_trace(go.Choropleth(
#             locations=merged_data['iso_a3'],
#             z=merged_data[genre],
#             text=merged_data['name'],
#             colorscale='Viridis',
#             autocolorscale=False,
#             marker_line_color='white',
#             marker_line_width=0.5,
#             colorbar_title='Count',
#             name=genre
#         ))

#     # Customize the layout
#     fig.update_layout(
#         title_text=f'Genre Distribution by Country in {year}',
#         height=1200,
#         width=1500,
#         barmode='stack',
#         geo=dict(
#             showframe=False,
#             showcoastlines=False,
#             projection_type='equirectangular'
#         )
#     )

#     # Show the plot
#     fig.show()

# # Get the unique years from the data
# years = data['year'].unique().tolist()

# # Use interact to create the interactive dropdown widget
# interact(update_plot, year=Dropdown(options=years))
# plt.show()