# %%capture
# #!pip install squarify geopandas plotly ipython ipywidgets dash
::: {.cell _cell_guid='b1076dfc-b9ad-4769-8c92-a6c4dae69d19' _uuid='8f2839f25d086af736a60e9eeb907d3b93b6e0e5' tags='[]' execution_count=1}
import random
import squarify
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('white')
import geopandas as gpd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
from IPython.display import HTML
from ipywidgets import interact, Dropdown
# from googlesearch import search
:::
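The cell that loads the data is not included in this export. Below is a minimal sketch of the assumed loading step, where the file names and the merge key (`artist_mb`) are hypothetical: the Spotify tracks table is joined with a Last.fm/MusicBrainz artist table, which contributes the mbid, artist_lastfm, country_mb, tags_lastfm, listeners_lastfm and related columns inspected in the output that follows.
# A minimal sketch of the assumed loading step (paths and merge key are hypothetical).
data = pd.read_csv('spotify_tracks.csv')          # hypothetical file name
artists_info = pd.read_csv('lastfm_artists.csv')  # hypothetical file name
data = data.merge(artists_info, left_on='artists', right_on='artist_mb', how='left')
print(data.shape)
data.info()
data.isnull().sum()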
(225456, 28)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 225456 entries, 0 to 225455
Data columns (total 28 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 acousticness 225456 non-null float64
1 artists 225444 non-null object
2 danceability 225456 non-null float64
3 duration_ms 225456 non-null int64
4 energy 225456 non-null float64
5 explicit 225456 non-null int64
6 id 225456 non-null object
7 instrumentalness 225456 non-null float64
8 key 225456 non-null int64
9 liveness 225456 non-null float64
10 loudness 225456 non-null float64
11 mode 225456 non-null int64
12 name 225456 non-null object
13 popularity 225456 non-null int64
14 release_date 225456 non-null object
15 speechiness 225456 non-null float64
16 tempo 225456 non-null float64
17 valence 225456 non-null float64
18 year 225456 non-null int64
19 mbid 181952 non-null object
20 artist_lastfm 177791 non-null object
21 country_mb 171605 non-null object
22 country_lastfm 132978 non-null object
23 tags_mb 142502 non-null object
24 tags_lastfm 160841 non-null object
25 listeners_lastfm 177791 non-null float64
26 scrobbles_lastfm 177791 non-null float64
27 ambiguous_artist 181952 non-null object
dtypes: float64(11), int64(6), object(11)
memory usage: 49.9+ MB
acousticness 0
artists 12
danceability 0
duration_ms 0
energy 0
explicit 0
id 0
instrumentalness 0
key 0
liveness 0
loudness 0
mode 0
name 0
popularity 0
release_date 0
speechiness 0
tempo 0
valence 0
year 0
mbid 43504
artist_lastfm 47665
country_mb 53851
country_lastfm 92478
tags_mb 82954
tags_lastfm 64615
listeners_lastfm 47665
scrobbles_lastfm 47665
ambiguous_artist 43504
dtype: int64
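The renaming and column selection that precede the missing-value check are not shown in this export; the sketch below reconstructs them from the column indices in the table that follows (the exact code is an assumption).
# Assumed preparation step (inferred from the outputs below): drop the Last.fm/MusicBrainz
# columns that are not used later and rename the two that are kept.
data = data.drop(columns=['id', 'mbid', 'artist_lastfm', 'country_lastfm', 'tags_mb',
                          'listeners_lastfm', 'scrobbles_lastfm', 'ambiguous_artist'])
data = data.rename(columns={'country_mb': 'artist_origin', 'tags_lastfm': 'genres'})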
# Check the null counts per column; rows with nulls in any remaining column are dropped next
df_missing = data.isnull().sum(axis = 0).reset_index()
df_missing.columns = ['column_name', 'missing_count']
df_missing['missing_ratio'] = df_missing['missing_count']/data.shape[0]
df_missing.query('missing_ratio>0').sort_values(by = 'missing_ratio', ascending = False)
|    | column_name   | missing_count | missing_ratio |
|----|---------------|---------------|---------------|
| 19 | genres        | 64615         | 0.286597      |
| 18 | artist_origin | 53851         | 0.238854      |
| 1  | artists       | 12            | 0.000053      |
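The drop itself is also not shown; a hedged sketch that matches the null counts and the (73887, 20) shape reported next (the 1980 cutoff is inferred from the minimum year in the later describe() table):
# Assumed row filter: keep tracks from 1980 onwards and drop rows still missing
# artists, artist_origin or genres.
df = data.query('year >= 1980').dropna()
print(df.isnull().sum())
print(df.shape)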
acousticness 0
artists 0
danceability 0
duration_ms 0
energy 0
explicit 0
instrumentalness 0
key 0
liveness 0
loudness 0
mode 0
name 0
popularity 0
release_date 0
speechiness 0
tempo 0
valence 0
year 0
artist_origin 0
genres 0
dtype: int64
(73887, 20)
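Before describing the data, duration_ms is replaced by a duration in seconds; a small sketch of that assumed conversion:
# Assumed unit conversion: the describe() table below reports `duration` in seconds
# (roughly 30 s to 3816 s), consistent with duration_ms / 1000.
df['duration'] = df['duration_ms'] / 1000
df = df.drop(columns=['duration_ms'])
df.describe()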
|       | acousticness | danceability | energy | explicit | instrumentalness | key | liveness | loudness | mode | popularity | speechiness | tempo | valence | year | duration |
|-------|--------------|--------------|--------|----------|------------------|-----|----------|----------|------|------------|-------------|-------|---------|------|----------|
| count | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 |
| mean | 0.270194 | 0.572947 | 0.623204 | 0.162004 | 0.094045 | 5.261115 | 0.200284 | -8.982739 | 0.683733 | 47.005820 | 0.088800 | 120.083159 | 0.520840 | 1999.056816 | 247.873261 |
| std | 0.304551 | 0.178310 | 0.244244 | 0.368457 | 0.241324 | 3.562242 | 0.175176 | 5.129790 | 0.465022 | 12.571747 | 0.101303 | 30.003903 | 0.254652 | 11.332604 | 88.925993 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -60.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1980.000000 | 30.080000 |
| 25% | 0.020500 | 0.456000 | 0.462500 | 0.000000 | 0.000000 | 2.000000 | 0.093100 | -11.216000 | 0.000000 | 38.000000 | 0.034500 | 96.013500 | 0.318000 | 1989.000000 | 200.095500 |
| 50% | 0.132000 | 0.587000 | 0.661000 | 0.000000 | 0.000024 | 5.000000 | 0.130000 | -7.688000 | 1.000000 | 46.000000 | 0.046600 | 117.956000 | 0.526000 | 1999.000000 | 236.253000 |
| 75% | 0.458000 | 0.705000 | 0.822000 | 0.000000 | 0.006290 | 8.000000 | 0.258000 | -5.440000 | 1.000000 | 55.000000 | 0.090500 | 139.557000 | 0.729000 | 2009.000000 | 279.933000 |
| max | 0.996000 | 0.986000 | 1.000000 | 1.000000 | 0.999000 | 11.000000 | 0.998000 | 1.342000 | 1.000000 | 100.000000 | 0.960000 | 244.091000 | 0.998000 | 2020.000000 | 3816.373000 |
dtypes = {
    'acousticness': 'float32',
    'danceability': 'float32',
    'energy': 'float32',
    'explicit': 'bool',
    'instrumentalness': 'float32',
    'key': 'uint8',
    'liveness': 'float32',
    'loudness': 'float32',
    'mode': 'bool',
    'popularity': 'float32',
    'speechiness': 'float32',
    'tempo': 'float32',
    'valence': 'float32',
    'year': 'uint16',
    'duration': 'float32',
    'artists': 'string',
    'name': 'string',
    'release_date': 'string',
    'genres': 'string',
    'artist_origin': 'string'
}
df = df.astype(dtypes) #Changing the dtypes for efficient processing.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 73887 entries, 5920 to 225454
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 acousticness 73887 non-null float32
1 artists 73887 non-null string
2 danceability 73887 non-null float32
3 energy 73887 non-null float32
4 explicit 73887 non-null bool
5 instrumentalness 73887 non-null float32
6 key 73887 non-null uint8
7 liveness 73887 non-null float32
8 loudness 73887 non-null float32
9 mode 73887 non-null bool
10 name 73887 non-null string
11 popularity 73887 non-null float32
12 release_date 73887 non-null string
13 speechiness 73887 non-null float32
14 tempo 73887 non-null float32
15 valence 73887 non-null float32
16 year 73887 non-null uint16
17 artist_origin 73887 non-null string
18 genres 73887 non-null string
19 duration 73887 non-null float32
dtypes: bool(2), float32(11), string(5), uint16(1), uint8(1)
memory usage: 6.8 MB
def treemap(year):
    if year != 'All':
        data = df.query('year == @year')
        title = f'Artist Origins Distribution for {year} - Treemap (Thresholded)'
    else:
        data = df
        min_year = np.min(df['year'])
        max_year = np.max(df['year'])
        title = f'Artist Origins Distribution from {min_year} to {max_year} - Treemap (Thresholded)'
    # Set the threshold count
    threshold = 0.01 * data.shape[0]
    # Get the counts of artist origins
    artist_counts = data['artist_origin'].value_counts()
    # Identify countries with counts less than the threshold
    other_countries = artist_counts[artist_counts < threshold]
    # Sum the counts of other countries
    other_count = other_countries.sum()
    # Exclude other countries from the main data
    filtered_counts = artist_counts.drop(other_countries.index)
    # Add the other count as a separate entry
    filtered_counts['Other'] = other_count
    # Calculate the total size
    total_size = filtered_counts.sum()
    # Create the treemap
    plt.figure(figsize=(10, 6))
    squarify.plot(sizes=filtered_counts, label=filtered_counts.index, alpha=0.8, color=sns.color_palette("muted"))
    plt.axis('off')
    # Add percentage labels
    for i, patch in enumerate(plt.gca().patches):
        x, y, dx, dy = patch.get_bbox().bounds
        label = filtered_counts.index[i]
        size = filtered_counts[label]
        percentage = (size / total_size) * 100
        plt.gca().text(x + dx / 2, y + dy / 2, f'\n\n{percentage:.1f}%', ha='center', va='center')
    plt.title(title)
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()
years = ['All'] + df['year'].unique().tolist()
# Create the interactive plot
interact(treemap, year=years)
plt.show()
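Note that ipywidgets output is not preserved in a static render of the notebook; calling the function directly produces one fixed view, e.g.:
# Render a single fixed view of the treemap (useful when the widget output is not captured).
treemap('All')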
# Read the data into a pandas DataFrame
# data = df # Replace 'your_data_file.csv' with your actual data file path
# Group the data by artist origin and count the number of artists from each region
artist_counts = df['artist_origin'].value_counts()
# Normalize the counts by dividing by the maximum count value
# normalized_counts = artist_counts / artist_counts.max()
# Create the choropleth map using plotly.graph_objects
fig = go.Figure(data=go.Choropleth(
    locations=artist_counts.index,
    z=artist_counts.values,
    locationmode='country names',
    colorscale='ylorrd',
    reversescale=False,  # Keep the default orientation: darker red for higher counts
    # zmin=0,  # Set the minimum value for the color scale
    # zmax=1,  # Set the maximum value for the color scale
))
# Set the layout properties
fig.update_layout(
    title_text='Artist Origins Count by Country from 1980 to 2020',
    height=1200,
    width=1500,
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
)
# Show the figure
fig.show()
plt.show()
def create_plot(year, country):
    # Filter the data for the selected year and country
    if country != 'All':
        if year != 'All':
            filtered_data = df[(df['year'] == year) & (df['artist_origin'] == country)]
            num = 5 if filtered_data.shape[0] > 5 else filtered_data.shape[0]  # For some years we have fewer than 5 artists
            # print(filtered_data.shape[0])
            title = f'Top {country} Artists in Year {year}'
        else:
            filtered_data = df[df['artist_origin'] == country]
            num = 5 if filtered_data.shape[0] > 5 else filtered_data.shape[0]  # For some years we have fewer than 5 artists
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            # print(filtered_data.shape[0])
            title = f'Top Artists from {country} across {min_year} - {max_year}'
    else:
        if year != 'All':
            filtered_data = df[df['year'] == year]
            num = 5 if filtered_data.shape[0] > 5 else filtered_data.shape[0]  # For some years we have fewer than 5 artists
            # print(filtered_data.shape[0])
            title = f'Top Artists from all countries in Year {year}'
        else:
            filtered_data = df
            num = 5 if filtered_data.shape[0] > 5 else filtered_data.shape[0]  # For some years we have fewer than 5 artists
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            # print(filtered_data.shape[0])
            title = f'Top Artists from all countries across {min_year} - {max_year}'
    # Group the data by artists and calculate the mean popularity per artist
    grouped = filtered_data.groupby('artists')['popularity'].mean().reset_index()
    # Sort the data by popularity in descending order and keep the top `num` artists
    sorted_data = grouped.sort_values(by='popularity', ascending=False).head(num)
    # Create the bar plot using Seaborn
    plt.figure(figsize=(12, 8))
    sns.barplot(data=sorted_data, x='artists', y='popularity', palette='muted')
    plt.title(title)
    plt.xlabel('Artists')
    plt.ylabel('Popularity')
    plt.xticks(rotation=45)
    plt.yticks(range(0, int(sorted_data['popularity'].max()) + 6, 5))  # Set y-axis ticks with a fixed increment of 5
    # Add artist origin labels when 'All' is selected
    if country == 'All':
        for i, bar in enumerate(plt.gca().patches):
            artist_name = sorted_data.iloc[i, 0]
            artist_origin = df.loc[df['artists'] == artist_name, 'artist_origin'].iloc[0]
            plt.gca().text(bar.get_x() + bar.get_width() / 2, bar.get_height() / 2, artist_origin, ha='center', va='center', rotation=90)
    plt.tight_layout()
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()
# Get the unique years and countries in the data
years = ['All'] + df['year'].unique().tolist()
countries = sorted(['All'] + df['artist_origin'].unique().tolist())
# Create the interactive plot
interact(create_plot, year=years, country=countries)
plt.show()
def create_plot(year, country, feature, top_n=5):
    # Select the columns needed for the chosen feature, then filter by country and year
    data_dummy = df.copy()
    data_dummy = data_dummy.loc[:, ['name', feature, 'artist_origin', 'year', 'genres']]
    if country != 'All':
        if year != 'All':
            filtered_data = data_dummy[(data_dummy['year'] == year) & (data_dummy['artist_origin'] == country)]
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0]  # For some years we have fewer than top_n songs
            title = f'Top {feature.capitalize()} {country} Songs in Year {year}'
        else:
            filtered_data = data_dummy[data_dummy['artist_origin'] == country]
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0]  # For some years we have fewer than top_n songs
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top {feature.capitalize()} Songs from {country} across {min_year} - {max_year}'
    else:
        if year != 'All':
            filtered_data = data_dummy[data_dummy['year'] == year]
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0]  # For some years we have fewer than top_n songs
            title = f'Top {feature.capitalize()} Songs from all countries in Year {year}'
        else:
            filtered_data = data_dummy
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0]  # For some years we have fewer than top_n songs
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top {feature.capitalize()} Songs from all countries across {min_year} - {max_year}'
    # Group the data by song name and genres and calculate the mean of the selected feature
    grouped = filtered_data.groupby(['name', 'genres'])[feature].mean().reset_index()
    # Only keep unique song names
    grouped = grouped.drop_duplicates(subset='name')
    # Sort by the feature in descending order and keep the top `num` songs
    sorted_data = grouped.sort_values(by=feature, ascending=False).head(num)
    # Only use the first 2 genres of each song
    sorted_data['genres'] = sorted_data['genres'].apply(lambda x: ';'.join(x.split(';')[0:2]))
    # Create the bar plot using Seaborn
    plt.figure(figsize=(12, 8))
    # Calculate the bar width
    bar_width = 3 / sorted_data.shape[0]
    sns.barplot(
        data=sorted_data,
        x='name',
        y=feature,
        palette='muted',
        hue='genres',
        width=bar_width  # Set the bar width
    )
    # Annotate each bar with the feature value (and the artist origin when 'All' countries are shown)
    for i, bar in enumerate(plt.gca().patches):
        if i < num:
            mean_value = sorted_data.iloc[i, 2]
            song_name = sorted_data.iloc[i, 0]
            origin = df.loc[df['name'] == song_name, 'artist_origin'].iloc[0]
            if country == 'All':
                plt.gca().text(
                    bar.get_x() + bar.get_width() / 2,
                    bar.get_height() / 2,
                    f'{mean_value:.2f} in {origin}',
                    ha='center',
                    va='center',
                    rotation=90
                )
            else:
                plt.gca().text(
                    bar.get_x() + bar.get_width() / 2,
                    bar.get_height() / 2,
                    f'{mean_value:.2f}',
                    ha='center',
                    va='center'
                )
    # Customize the x-axis ticks
    xticks = np.arange(len(sorted_data))  # + (bar_width/2.7)  # Add half of the bar width
    plt.xticks(xticks, sorted_data['name'], rotation=36)
    plt.gca().xaxis.set_tick_params(width=1)  # Set the tick width
    plt.xlabel('Name of Songs')
    plt.ylabel(f'{feature.capitalize()}')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()
# Get the unique years and countries in the data
years = ['All'] + df['year'].unique().tolist()
countries = sorted(['All'] + df['artist_origin'].unique().tolist())
features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
tops = [5,10,15]
# Removed 'explicit' (whether or not the track has explicit lyrics); not that important here
# Removed 'year' since it is not an audio feature of a song
# Create the interactive plot
interact(create_plot, year=years,country=countries, feature = features, top_n=tops)
plt.show()
features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
# Define a function to update the plot based on the selected feature
def update_plot(feature):
    # Group the data by artist origin and calculate the mean of the selected feature per country
    feature_means = df.groupby('artist_origin')[feature].mean()
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    # Merge the data with the world shapefile based on the country names
    merged_data = world.merge(feature_means, left_on='name', right_index=True)
    # Create a choropleth map using plotly
    fig = go.Figure(data=go.Choropleth(
        locations=merged_data['iso_a3'],
        z=merged_data[feature],
        text=merged_data['name'],
        colorscale='Viridis',
        autocolorscale=False,
        marker_line_color='white',
        marker_line_width=0.5,
        colorbar_title=f'{feature.capitalize()}'
    ))
    # Customize the layout
    fig.update_layout(
        title_text=f'{feature.capitalize()} Distribution by Country',
        height=1200,
        width=1500,
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='equirectangular'
        )
    )
    # Show the plot
    fig.show()
features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
# Use interact to create the interactive dropdown widget
interact(update_plot, feature=Dropdown(options=features))
plt.show()
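One caveat with the merge above (an observation, not something checked in the original code): MusicBrainz country names do not always match the Natural Earth `name` column (for example, 'United States' vs. 'United States of America'), and non-matching countries are silently dropped from the map. A quick way to list what the choropleth omits:
# List artist_origin values with no exact match among the Natural Earth country names,
# i.e. countries that the choropleth above silently leaves blank.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
unmatched = sorted(set(df['artist_origin'].unique()) - set(world['name']))
print(unmatched)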
# The distribution of each feature over the years, aggregated; a more detailed version of this is the plot below
# Define the list of features
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
'tempo', 'valence']
# Function to create the interactive histogram plot
def plot_histogram(feature):
    # Create the histogram plot using seaborn, stacking one layer per year
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x=feature, bins=30, kde=True, hue='year', multiple='stack', palette='viridis')
    plt.title(f'{feature.capitalize()} Distribution by Year')
    plt.xlabel(feature.capitalize())
    plt.ylabel('Frequency')
    plt.legend(title='Year')
    plt.show()
# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()
# Define the list of features
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
'tempo', 'valence']
# Function to create the interactive histogram plot
def plot_histogram(feature):
    # Create a single axis shared by all years
    fig, ax = plt.subplots(figsize=(24, 9), dpi=120)
    # Iterate over each year and create the histogram for the selected feature
    for year, group in df.groupby('year'):
        sns.histplot(data=group, x=feature, bins=30, kde=True, ax=ax, label=str(year))
    # Set the plot title and labels
    ax.set_title(f'{feature.capitalize()} Distribution by Year')
    ax.set_xlabel(feature.capitalize())
    ax.set_ylabel('Frequency')
    # Set the legend inside the plot with 8 columns
    legend = ax.legend(title='Year', bbox_to_anchor=(1, 1), ncol=8)
    # Adjust the plot to accommodate the legend
    plt.subplots_adjust(right=0.85)
    # Show the plot
    plt.show()
# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()
# Define the list of features
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
'tempo', 'valence']
# Define the number of light colors to select
num_colors = len(features)
desired_colors = ['tab:purple', 'tab:orange', 'tab:green', 'tab:blue']
# Function to create the interactive bar plot of yearly means
def plot_histogram(feature):
    # Select random light colors from 'tab10' palette
    # palette = random.sample(sns.color_palette('tab10', 10)[:5], 1)
    # Group the data by year and calculate the mean of the selected feature per year
    feature_means = df.groupby('year')[feature].mean()
    # Select a random color from the desired palette
    color = random.choice(desired_colors)
    fig, ax = plt.subplots(1, 1, figsize=(20, 9), dpi=120)
    # Create the bar plot for the means using seaborn
    ax = sns.barplot(x=feature_means.index, y=feature_means, color=color)
    # Set the plot title and labels
    ax.set_title(f'Mean {feature.capitalize()} by Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Mean Value')
    # Rotate and evenly space the year labels on the x-axis
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.xaxis.set_tick_params(pad=10)
    # Show the plot
    plt.tight_layout()
    plt.show()
# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()
# import geopandas as gpd
# import pandas as pd
# import plotly.graph_objects as go
# from ipywidgets import interact, Dropdown
# # Read the world shapefile
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# # Read the data containing country and genre information
# data = ac
# # Define the function to update the plot based on the selected year
# def update_plot(year):
# # Filter the data for the selected year
# filtered_data = data[data['year'] == year]
# # Calculate the genre distribution by country
# genre_distribution = filtered_data.groupby('artist_origin')['genres'].value_counts().unstack().reset_index().fillna(0)
# # Merge the data with the world shapefile based on the country names
# merged_data = world.merge(genre_distribution, left_on='name', right_on='artist_origin')
# # Create a stacked bar plot using plotly
# fig = go.Figure()
# for genre in genre_distribution.columns[1:]:
# fig.add_trace(go.Choropleth(
# locations=merged_data['iso_a3'],
# z=merged_data[genre],
# text=merged_data['name'],
# colorscale='Viridis',
# autocolorscale=False,
# marker_line_color='white',
# marker_line_width=0.5,
# colorbar_title='Count',
# name=genre
# ))
# # Customize the layout
# fig.update_layout(
# title_text=f'Genre Distribution by Country in {year}',
# height=1200,
# width=1500,
# barmode='stack',
# geo=dict(
# showframe=False,
# showcoastlines=False,
# projection_type='equirectangular'
# )
# )
# # Show the plot
# fig.show()
# # Get the unique years from the data
# years = data['year'].unique().tolist()
# # Use interact to create the interactive dropdown widget
# interact(update_plot, year=Dropdown(options=years))
# plt.show()