library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(RSQLite)
library(DBI)
db <- 'chinook.db'
run_query <- function(x){
conn = dbConnect(SQLite(),db) #local environment, `RSQlite()`
result <- dbGetQuery(conn, x)
dbDisconnect(conn)
return (result)
}
show_table <- function(){
q <- 'SELECT name, type
FROM sqlite_master
WHERE type IN ("table","view");'
return (run_query(q))
}
run show_table
show_table()
## name type
## 1 album table
## 2 artist table
## 3 customer table
## 4 employee table
## 5 genre table
## 6 invoice table
## 7 invoice_line table
## 8 media_type table
## 9 playlist table
## 10 playlist_track table
## 11 track table
Sum of total USA quantity
q <- 'SELECT SUM(il.quantity) total_quantity
FROM invoice_line il
LEFT JOIN invoice i ON i.invoice_id = il.invoice_id
LEFT JOIN customer c ON i.customer_id = c.customer_id
WHERE c.country = "USA"
'
run_query(q)
## total_quantity
## 1 1051
Build up query
usa_sales <- 'SELECT g.name genre,
SUM(il.quantity) quantity,
CAST(SUM(il.quantity) AS FLOAT)/(
SELECT SUM(il.quantity) total_quantity
FROM invoice_line il
LEFT JOIN invoice i ON i.invoice_id = il.invoice_id
LEFT JOIN customer c ON i.customer_id = c.customer_id
WHERE c.country = "USA"
) precentage
FROM invoice_line il
LEFT JOIN invoice i ON i.invoice_id = il.invoice_id
LEFT JOIN customer c ON i.customer_id = c.customer_id
LEFT JOIN track t ON il.track_id = t.track_id
LEFT JOIN genre g ON g.genre_id = t.genre_id
WHERE c.country = "USA"
GROUP BY g.name'
run_query(usa_sales)
## genre quantity precentage
## 1 Alternative 35 0.0333016175
## 2 Alternative & Punk 130 0.1236917222
## 3 Blues 36 0.0342530923
## 4 Classical 4 0.0038058991
## 5 Easy Listening 13 0.0123691722
## 6 Electronica/Dance 5 0.0047573739
## 7 Heavy Metal 3 0.0028544244
## 8 Hip Hop/Rap 20 0.0190294957
## 9 Jazz 14 0.0133206470
## 10 Latin 22 0.0209324453
## 11 Metal 124 0.1179828735
## 12 Pop 22 0.0209324453
## 13 R&B/Soul 53 0.0504281637
## 14 Reggae 6 0.0057088487
## 15 Rock 561 0.5337773549
## 16 Soundtrack 2 0.0019029496
## 17 TV Shows 1 0.0009514748
Visualization
run_query(usa_sales) %>%
ggplot(aes(x = genre, y= quantity))+
geom_col()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
run_query(usa_sales) %>%
ggplot(aes(x = '', y= precentage, fill = genre))+
geom_col()+
coord_polar(theta = "y")
albums_to_purchase = '
WITH usa_tracks_sold AS
(
SELECT il.* FROM invoice_line il
INNER JOIN invoice i on il.invoice_id = i.invoice_id
INNER JOIN customer c on i.customer_id = c.customer_id
WHERE c.country = "USA"
)
SELECT
g.name genre,
count(uts.invoice_line_id) tracks_sold,
cast(count(uts.invoice_line_id) AS FLOAT) / (
SELECT COUNT(*) from usa_tracks_sold
) percentage_sold
FROM usa_tracks_sold uts
INNER JOIN track t on t.track_id = uts.track_id
INNER JOIN genre g on g.genre_id = t.genre_id
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
'
run_query(albums_to_purchase)
## genre tracks_sold percentage_sold
## 1 Rock 561 0.53377735
## 2 Alternative & Punk 130 0.12369172
## 3 Metal 124 0.11798287
## 4 R&B/Soul 53 0.05042816
## 5 Blues 36 0.03425309
## 6 Alternative 35 0.03330162
## 7 Pop 22 0.02093245
## 8 Latin 22 0.02093245
## 9 Hip Hop/Rap 20 0.01902950
## 10 Jazz 14 0.01332065
genre_sales = run_query(albums_to_purchase)
ggplot(data = genre_sales, aes(x = reorder(genre, -percentage_sold),
y = percentage_sold)) +
geom_bar(stat = "identity")
Among the genres represented in our list of 4 albums, punk, blues and pop are the highest rated. Therefore, we should recommend:
By far though, rock makes up the majority of the sales. To better capture sales in the USA, we might want to ask the record label if they have any up-and-coming rock bands.
support agent’s performance on total sales, quantity, invoice, clients
employee_performance <- '
SELECT e.first_name||" "||e.last_name support_agent,
SUM(iL.unit_price*il.quantity) total_sales,
SUM(il.quantity) track_quantity,
COUNT(DISTINCT i.invoice_id) invoice_quantity,
COUNT(DISTINCT c.customer_id) clients
FROM invoice_line il
LEFT JOIN invoice i ON i.invoice_id = il.invoice_id
LEFT JOIN customer c ON c.customer_id = i.customer_id
LEFT JOIN employee e ON e.employee_id = c. support_rep_id
GROUP BY 1
ORDER BY 1 DESC
'
run_query(employee_performance) %>%
pivot_longer(cols = 2:5,
names_to = 'KPI',
values_to = 'values'
) %>%
ggplot(aes(x = support_agent, y = values,))+
geom_col()+
facet_wrap(vars(KPI),
scale = 'free_y'
)
It shows Jane Peacock has the best sales, sold most quantity of the track, gain the most clients. Margaret Park stands the second place, while makes the most transactions in the team. We can further anylyze the average track per client, average sales per invoice as KPI.
employee_sales_performance = '
WITH customer_support_rep_sales AS
(
SELECT
i.customer_id,
c.support_rep_id,
SUM(i.total) total
FROM invoice i
INNER JOIN customer c ON i.customer_id = c.customer_id
GROUP BY 1,2
)
SELECT
e.first_name || " " || e.last_name employee,
e.hire_date,
SUM(csrs.total) total_sales
FROM customer_support_rep_sales csrs
INNER JOIN employee e ON e.employee_id = csrs.support_rep_id
GROUP BY 1;
'
run_query(employee_sales_performance)
## employee hire_date total_sales
## 1 Jane Peacock 2017-04-01 00:00:00 1731.51
## 2 Margaret Park 2017-05-03 00:00:00 1584.00
## 3 Steve Johnson 2017-10-17 00:00:00 1393.92
employee_sales = run_query(employee_sales_performance)
ggplot(data = employee_sales, aes(x = reorder(employee, -total_sales),
y = total_sales)) +
geom_bar(stat = "identity")
Jane Peacock has the highest amount of sales, but she also has been at the company the longest. If we really want to hone in on employee efficiency, we might want to standardize sales by the number of days or hours worked.
Query that collates data on purchases from different countries, where a country has only one customer, collect them into an “Other” group.The results is sorted by the total sales from highest to lowest, with the “Other” group at the very bottom.
country_sales <- '
WITH country_sales AS
(
SELECT CASE
WHEN COUNT(DISTINCT i.customer_id) = 1 THEN "Other"
ELSE c.country
END AS country_new,
SUM(il.unit_price * il.quantity) total_sales,
SUM(il.quantity) total_quantity,
COUNT(DISTINCT il.invoice_id) total_invoice,
COUNT(DISTINCT i.customer_id) total_customer
FROM invoice_line il
LEFT JOIN invoice i ON il.invoice_id = i.invoice_id
LEFT JOIN customer c ON c.customer_id = i.customer_id
GROUP BY c.country
)
SELECT country_new country,
SUM(total_customer) customer,
SUM(total_sales) sales,
SUM(total_sales) / SUM(total_customer) sales_per_customer,
SUM(total_sales) / SUM(total_invoice) sales_per_order,
CASE
WHEN total_quantity = 1 THEN 1
ELSE 0
END AS sort
FROM country_sales
GROUP BY country_new
ORDER BY SORT ASC, total_sales DESC;
'
run_query(country_sales)
## country customer sales sales_per_customer sales_per_order sort
## 1 USA 13 1040.49 80.03769 7.942672 0
## 2 Canada 8 535.59 66.94875 7.047237 0
## 3 Brazil 5 427.68 85.53600 7.011148 0
## 4 France 5 389.07 77.81400 7.781400 0
## 5 Germany 4 334.62 83.65500 8.161463 0
## 6 Czech Republic 2 273.24 136.62000 9.108000 0
## 7 United Kingdom 3 245.52 81.84000 8.768571 0
## 8 Portugal 2 185.13 92.56500 6.383793 0
## 9 India 2 183.15 91.57500 8.721429 0
## 10 Other 15 1094.94 72.99600 7.448571 0
Let’s put them into a plot!
country_analysis <- run_query(country_sales)
country_analysis %>%
ggplot(aes(x= reorder(country, -customer) , y=customer, fill = country))+
geom_col()+
labs(
title = "Number of customers by country",
x = "Country",
y = "Customers"
)+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
country_analysis %>%
ggplot(aes(x= reorder(country, -sales) , y=sales, fill = country))+
geom_col()+
coord_polar('y')+
labs(
title = "Total sales by country",
x = "Country",
y = "Total Sales"
)
country_analysis %>%
ggplot(aes(x= reorder(country, -sales_per_customer) , y=sales_per_customer, color = country))+
geom_point()+
labs(
title = "Customer average sales by country",
x = "Country",
y = "Customer Lifetime Value"
)+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Tried different ways to plot and found three interesting point to uplift the future sales:
sales_by_country = '
WITH country_or_other AS
(
SELECT
CASE
WHEN (
SELECT count(*)
FROM customer
where country = c.country
) = 1 THEN "Other"
ELSE c.country
END AS country,
c.customer_id,
il.*
FROM invoice_line il
INNER JOIN invoice i ON i.invoice_id = il.invoice_id
INNER JOIN customer c ON c.customer_id = i.customer_id
)
SELECT
country,
customers,
total_sales,
average_order,
customer_lifetime_value
FROM
(
SELECT
country,
count(distinct customer_id) customers,
SUM(unit_price) total_sales,
SUM(unit_price) / count(distinct customer_id) customer_lifetime_value,
SUM(unit_price) / count(distinct invoice_id) average_order,
CASE
WHEN country = "Other" THEN 1
ELSE 0
END AS sort
FROM country_or_other
GROUP BY country
ORDER BY sort ASC, total_sales DESC
);
'
run_query(sales_by_country)
## country customers total_sales average_order customer_lifetime_value
## 1 USA 13 1040.49 7.942672 80.03769
## 2 Canada 8 535.59 7.047237 66.94875
## 3 Brazil 5 427.68 7.011148 85.53600
## 4 France 5 389.07 7.781400 77.81400
## 5 Germany 4 334.62 8.161463 83.65500
## 6 Czech Republic 2 273.24 9.108000 136.62000
## 7 United Kingdom 3 245.52 8.768571 81.84000
## 8 Portugal 2 185.13 6.383793 92.56500
## 9 India 2 183.15 8.721429 91.57500
## 10 Other 15 1094.94 7.448571 72.99600
Query that categorizes each invoice as either an album purchase or not, and calculates the following summary statistics:
album_vs_track <- '
WITH buytrack AS (
SELECT il.invoice_id invoice_id,
CASE
WHEN (
SELECT track_id
FROM invoice_line
WHERE invoice_id = il.invoice_id
EXCEPT
SELECT track_id
FROM track
WHERE album_id = t.album_id
) IS NULL
AND (
SELECT track_id
FROM track
WHERE album_id = t.album_id
EXCEPT
SELECT track_id
FROM invoice_line
WHERE invoice_id = il.invoice_id
) IS NULL
THEN "No"
ELSE "YES"
END AS if_track
FROM invoice_line il
LEFT JOIN track t ON il.track_id = t.track_id
)
SELECT if_track,
COUNT(DISTINCT invoice_id) number_of_invoice,
COUNT(DISTINCT invoice_id)
/CAST(
(SELECT COUNT(invoice_id) FROM invoice)
AS FLOAT
) percentage
FROM buytrack
GROUP BY if_track
'
run_query(album_vs_track)
## if_track number_of_invoice percentage
## 1 No 114 0.1856678
## 2 YES 500 0.8143322
There are more than 80% of the purchase is by track from the sales report. It is good idea to change the strategy to purchase only the most popular tracks from each album from record companies, instead of purchasing every track from an album.
albums_vs_tracks = '
WITH invoice_first_track AS
(
SELECT
il.invoice_id invoice_id,
MIN(il.track_id) first_track_id
FROM invoice_line il
GROUP BY 1
)
SELECT
album_purchase,
COUNT(invoice_id) number_of_invoices,
CAST(count(invoice_id) AS FLOAT) / (
SELECT COUNT(*) FROM invoice
) percent
FROM
(
SELECT
ifs.*,
CASE
WHEN
(
SELECT t.track_id FROM track t
WHERE t.album_id = (
SELECT t2.album_id FROM track t2
WHERE t2.track_id = ifs.first_track_id
)
EXCEPT
SELECT il2.track_id FROM invoice_line il2
WHERE il2.invoice_id = ifs.invoice_id
) IS NULL
AND
(
SELECT il2.track_id FROM invoice_line il2
WHERE il2.invoice_id = ifs.invoice_id
EXCEPT
SELECT t.track_id FROM track t
WHERE t.album_id = (
SELECT t2.album_id FROM track t2
WHERE t2.track_id = ifs.first_track_id
)
) IS NULL
THEN "yes"
ELSE "no"
END AS "album_purchase"
FROM invoice_first_track ifs
)
GROUP BY album_purchase;
'
run_query(albums_vs_tracks)
## album_purchase number_of_invoices percent
## 1 no 500 0.8143322
## 2 yes 114 0.1856678
#remaining question for future exploration