Load library

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggplot2)
library(RSQLite)
library(DBI)

Creating Helper Functions

db <- 'chinook.db'

run_query <- function(x){
  conn = dbConnect(SQLite(),db)  #local environment, `RSQlite()`
  result <- dbGetQuery(conn, x)
  dbDisconnect(conn)
  return (result)
}

show_table <- function(){
  q <- 'SELECT name, type
        FROM sqlite_master
        WHERE type IN ("table","view");'
  return (run_query(q))
}

run show_table

show_table()

##              name  type
## 1           album table
## 2          artist table
## 3        customer table
## 4        employee table
## 5           genre table
## 6         invoice table
## 7    invoice_line table
## 8      media_type table
## 9        playlist table
## 10 playlist_track table
## 11          track table

the best genres in the USA

Sum of total USA quantity

q <-  'SELECT SUM(il.quantity) total_quantity
       FROM invoice_line il
       LEFT JOIN invoice i ON i.invoice_id = il.invoice_id
       LEFT JOIN customer c ON i.customer_id = c.customer_id
       WHERE c.country = "USA" 
       '
run_query(q)

##   total_quantity
## 1           1051

Build up query

usa_sales <- 'SELECT g.name genre,
                      SUM(il.quantity) quantity,
                      CAST(SUM(il.quantity) AS FLOAT)/(
                                      SELECT SUM(il.quantity) total_quantity
                                      FROM invoice_line il
                                      LEFT JOIN invoice i ON i.invoice_id = il.invoice_id
                                      LEFT JOIN customer c ON i.customer_id = c.customer_id
                                      WHERE c.country = "USA"
                                      ) precentage
              FROM invoice_line il
              LEFT JOIN invoice i ON i.invoice_id = il.invoice_id
              LEFT JOIN customer c ON i.customer_id = c.customer_id
              LEFT JOIN track t ON il.track_id = t.track_id
              LEFT JOIN genre g ON g.genre_id = t.genre_id
              WHERE c.country = "USA" 
              GROUP BY g.name'
              
run_query(usa_sales)

##                 genre quantity   precentage
## 1         Alternative       35 0.0333016175
## 2  Alternative & Punk      130 0.1236917222
## 3               Blues       36 0.0342530923
## 4           Classical        4 0.0038058991
## 5      Easy Listening       13 0.0123691722
## 6   Electronica/Dance        5 0.0047573739
## 7         Heavy Metal        3 0.0028544244
## 8         Hip Hop/Rap       20 0.0190294957
## 9                Jazz       14 0.0133206470
## 10              Latin       22 0.0209324453
## 11              Metal      124 0.1179828735
## 12                Pop       22 0.0209324453
## 13           R&B/Soul       53 0.0504281637
## 14             Reggae        6 0.0057088487
## 15               Rock      561 0.5337773549
## 16         Soundtrack        2 0.0019029496
## 17           TV Shows        1 0.0009514748

Visualization

run_query(usa_sales) %>%
  ggplot(aes(x = genre, y= quantity))+
         geom_col()+
         theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

run_query(usa_sales) %>%
  ggplot(aes(x = '', y= precentage, fill = genre))+
         geom_col()+
         coord_polar(theta = "y")

answer from the course

albums_to_purchase = '
WITH usa_tracks_sold AS
   (
    SELECT il.* FROM invoice_line il
    INNER JOIN invoice i on il.invoice_id = i.invoice_id
    INNER JOIN customer c on i.customer_id = c.customer_id
    WHERE c.country = "USA"
   )
SELECT
    g.name genre,
    count(uts.invoice_line_id) tracks_sold,
    cast(count(uts.invoice_line_id) AS FLOAT) / (
        SELECT COUNT(*) from usa_tracks_sold
    ) percentage_sold
FROM usa_tracks_sold uts
INNER JOIN track t on t.track_id = uts.track_id
INNER JOIN genre g on g.genre_id = t.genre_id
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
'
run_query(albums_to_purchase)

##                 genre tracks_sold percentage_sold
## 1                Rock         561      0.53377735
## 2  Alternative & Punk         130      0.12369172
## 3               Metal         124      0.11798287
## 4            R&B/Soul          53      0.05042816
## 5               Blues          36      0.03425309
## 6         Alternative          35      0.03330162
## 7                 Pop          22      0.02093245
## 8               Latin          22      0.02093245
## 9         Hip Hop/Rap          20      0.01902950
## 10               Jazz          14      0.01332065

genre_sales = run_query(albums_to_purchase)
ggplot(data = genre_sales, aes(x = reorder(genre, -percentage_sold), 
                               y = percentage_sold)) +
  geom_bar(stat = "identity")

Among the genres represented in our list of 4 albums, punk, blues and pop are the highest rated. Therefore, we should recommend:

Red Tone (Punk)
Slim Jim Bites (Blues)
Meteor and the Girls (Pop)

By far though, rock makes up the majority of the sales. To better capture sales in the USA, we might want to ask the record label if they have any up-and-coming rock bands.

Analyzing Employee Sales Performance

support agent’s performance on total sales, quantity, invoice, clients

employee_performance <- '
    SELECT e.first_name||" "||e.last_name support_agent,
           SUM(iL.unit_price*il.quantity) total_sales,
           SUM(il.quantity) track_quantity,
           COUNT(DISTINCT i.invoice_id) invoice_quantity,
           COUNT(DISTINCT c.customer_id) clients
    FROM invoice_line il
    LEFT JOIN invoice i ON i.invoice_id = il.invoice_id
    LEFT JOIN customer c ON c.customer_id = i.customer_id
    LEFT JOIN employee e ON e.employee_id = c. support_rep_id
    GROUP BY 1
    ORDER BY 1 DESC
    '

run_query(employee_performance) %>%
  pivot_longer(cols = 2:5,
               names_to = 'KPI',
               values_to = 'values'
               ) %>% 
ggplot(aes(x = support_agent, y = values,))+
  geom_col()+
  facet_wrap(vars(KPI),
             scale = 'free_y'
             )

It shows Jane Peacock has the best sales, sold most quantity of the track, gain the most clients. Margaret Park stands the second place, while makes the most transactions in the team. We can further anylyze the average track per client, average sales per invoice as KPI.

answer from the course

employee_sales_performance = '
WITH customer_support_rep_sales AS
    (
     SELECT
         i.customer_id,
         c.support_rep_id,
         SUM(i.total) total
     FROM invoice i
     INNER JOIN customer c ON i.customer_id = c.customer_id
     GROUP BY 1,2
    )
SELECT
    e.first_name || " " || e.last_name employee,
    e.hire_date,
    SUM(csrs.total) total_sales
FROM customer_support_rep_sales csrs
INNER JOIN employee e ON e.employee_id = csrs.support_rep_id
GROUP BY 1;
'
run_query(employee_sales_performance)

##        employee           hire_date total_sales
## 1  Jane Peacock 2017-04-01 00:00:00     1731.51
## 2 Margaret Park 2017-05-03 00:00:00     1584.00
## 3 Steve Johnson 2017-10-17 00:00:00     1393.92

employee_sales = run_query(employee_sales_performance)
ggplot(data = employee_sales, aes(x = reorder(employee, -total_sales), 
                               y = total_sales)) +
  geom_bar(stat = "identity")

Jane Peacock has the highest amount of sales, but she also has been at the company the longest. If we really want to hone in on employee efficiency, we might want to standardize sales by the number of days or hours worked.

Visualizing Sales by Country

Query that collates data on purchases from different countries, where a country has only one customer, collect them into an “Other” group.The results is sorted by the total sales from highest to lowest, with the “Other” group at the very bottom.

country_sales <- '
  WITH country_sales AS 
      (
      SELECT CASE
                WHEN COUNT(DISTINCT i.customer_id) = 1 THEN "Other"
                ELSE c.country
             END AS country_new,
             SUM(il.unit_price * il.quantity) total_sales,
             SUM(il.quantity) total_quantity,
             COUNT(DISTINCT il.invoice_id) total_invoice,
             COUNT(DISTINCT i.customer_id) total_customer
      FROM invoice_line il
      LEFT JOIN invoice i ON il.invoice_id = i.invoice_id
      LEFT JOIN customer c ON c.customer_id = i.customer_id
      GROUP BY c.country
      )
  SELECT country_new country,
         SUM(total_customer) customer,
         SUM(total_sales) sales,
         SUM(total_sales) / SUM(total_customer) sales_per_customer,
         SUM(total_sales) / SUM(total_invoice) sales_per_order,
         CASE
            WHEN total_quantity = 1 THEN 1
            ELSE 0
         END AS sort
  FROM country_sales
  GROUP BY country_new
  ORDER BY SORT ASC, total_sales DESC;
  '
 run_query(country_sales)

##           country customer   sales sales_per_customer sales_per_order sort
## 1             USA       13 1040.49           80.03769        7.942672    0
## 2          Canada        8  535.59           66.94875        7.047237    0
## 3          Brazil        5  427.68           85.53600        7.011148    0
## 4          France        5  389.07           77.81400        7.781400    0
## 5         Germany        4  334.62           83.65500        8.161463    0
## 6  Czech Republic        2  273.24          136.62000        9.108000    0
## 7  United Kingdom        3  245.52           81.84000        8.768571    0
## 8        Portugal        2  185.13           92.56500        6.383793    0
## 9           India        2  183.15           91.57500        8.721429    0
## 10          Other       15 1094.94           72.99600        7.448571    0

Let’s put them into a plot!

country_analysis <-  run_query(country_sales) 

country_analysis %>%
  ggplot(aes(x= reorder(country, -customer) , y=customer, fill = country))+
  geom_col()+
  labs(
    title = "Number of customers by country",
    x = "Country",
    y = "Customers"
  )+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

country_analysis %>%
  ggplot(aes(x= reorder(country, -sales) , y=sales, fill = country))+
  geom_col()+
  coord_polar('y')+
  labs(
    title = "Total sales by country",
    x = "Country",
    y = "Total Sales"
  )

country_analysis %>%
  ggplot(aes(x= reorder(country, -sales_per_customer) , y=sales_per_customer, color = country))+
  geom_point()+
  labs(
    title = "Customer average sales by country",
    x = "Country",
    y = "Customer Lifetime Value"
  )+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Tried different ways to plot and found three interesting point to uplift the future sales:

USA contributes the most sales from the highest number of the customers, it is always the biggest market to prioritize the resources in order to increase the sales.
Canada stands the second place for the sales with the second most of the customers. However the average customer purchasing value is the lowest. It is a good potential improve the sales by encourage individual customer to buy more.
Czech Republic gets the highest average customer purchase. This a great potential market to explore, because they love our products. We need to focus on get more people to know our business.

answer from the course

sales_by_country = '
WITH country_or_other AS
    (
     SELECT
       CASE
           WHEN (
                 SELECT count(*)
                 FROM customer
                 where country = c.country
                ) = 1 THEN "Other"
           ELSE c.country
       END AS country,
       c.customer_id,
       il.*
     FROM invoice_line il
     INNER JOIN invoice i ON i.invoice_id = il.invoice_id
     INNER JOIN customer c ON c.customer_id = i.customer_id
    )
SELECT
    country,
    customers,
    total_sales,
    average_order,
    customer_lifetime_value
FROM
    (
    SELECT
        country,
        count(distinct customer_id) customers,
        SUM(unit_price) total_sales,
        SUM(unit_price) / count(distinct customer_id) customer_lifetime_value,
        SUM(unit_price) / count(distinct invoice_id) average_order,
        CASE
            WHEN country = "Other" THEN 1
            ELSE 0
        END AS sort
    FROM country_or_other
    GROUP BY country
    ORDER BY sort ASC, total_sales DESC
    );
'
run_query(sales_by_country)

##           country customers total_sales average_order customer_lifetime_value
## 1             USA        13     1040.49      7.942672                80.03769
## 2          Canada         8      535.59      7.047237                66.94875
## 3          Brazil         5      427.68      7.011148                85.53600
## 4          France         5      389.07      7.781400                77.81400
## 5         Germany         4      334.62      8.161463                83.65500
## 6  Czech Republic         2      273.24      9.108000               136.62000
## 7  United Kingdom         3      245.52      8.768571                81.84000
## 8        Portugal         2      185.13      6.383793                92.56500
## 9           India         2      183.15      8.721429                91.57500
## 10          Other        15     1094.94      7.448571                72.99600

Albums vs. Individual Tracks

Query that categorizes each invoice as either an album purchase or not, and calculates the following summary statistics:

Number of invoices
Percentage of invoices

album_vs_track <- '
    WITH buytrack AS (
                     SELECT il.invoice_id invoice_id,
                            CASE
                              WHEN (
                                SELECT track_id 
                                FROM invoice_line
                                WHERE invoice_id = il.invoice_id
                                EXCEPT
                                SELECT track_id
                                FROM  track 
                                WHERE album_id = t.album_id
                              ) IS NULL
                              AND (
                               SELECT track_id
                                FROM  track 
                                WHERE album_id = t.album_id
                                EXCEPT
                               SELECT track_id 
                                FROM invoice_line
                                WHERE invoice_id = il.invoice_id
                              ) IS NULL
                                THEN "No"
                              ELSE "YES"
                              END AS if_track
                     FROM invoice_line il
                     LEFT JOIN track t ON il.track_id = t.track_id
              
    )
  SELECT  if_track,
          COUNT(DISTINCT invoice_id) number_of_invoice,
          COUNT(DISTINCT invoice_id)
          /CAST(
            (SELECT COUNT(invoice_id) FROM invoice) 
            AS FLOAT
            ) percentage
  FROM buytrack
  GROUP BY if_track
          
'
run_query(album_vs_track)

##   if_track number_of_invoice percentage
## 1       No               114  0.1856678
## 2      YES               500  0.8143322

There are more than 80% of the purchase is by track from the sales report. It is good idea to change the strategy to purchase only the most popular tracks from each album from record companies, instead of purchasing every track from an album.

answer from the course

albums_vs_tracks = '
WITH invoice_first_track AS
    (
     SELECT
         il.invoice_id invoice_id,
         MIN(il.track_id) first_track_id
     FROM invoice_line il
     GROUP BY 1
    )
SELECT
    album_purchase,
    COUNT(invoice_id) number_of_invoices,
    CAST(count(invoice_id) AS FLOAT) / (
                                         SELECT COUNT(*) FROM invoice
                                      ) percent
FROM
    (
    SELECT
        ifs.*,
        CASE
            WHEN
                 (
                  SELECT t.track_id FROM track t
                  WHERE t.album_id = (
                                      SELECT t2.album_id FROM track t2
                                      WHERE t2.track_id = ifs.first_track_id
                                     ) 
                  EXCEPT 
                  SELECT il2.track_id FROM invoice_line il2
                  WHERE il2.invoice_id = ifs.invoice_id
                 ) IS NULL
             AND
                 (
                  SELECT il2.track_id FROM invoice_line il2
                  WHERE il2.invoice_id = ifs.invoice_id
                  EXCEPT 
                  SELECT t.track_id FROM track t
                  WHERE t.album_id = (
                                      SELECT t2.album_id FROM track t2
                                      WHERE t2.track_id = ifs.first_track_id
                                     ) 
                 ) IS NULL
             THEN "yes"
             ELSE "no"
         END AS "album_purchase"
     FROM invoice_first_track ifs
    )
GROUP BY album_purchase;
'
run_query(albums_vs_tracks)

##   album_purchase number_of_invoices   percent
## 1             no                500 0.8143322
## 2            yes                114 0.1856678

#remaining question for future exploration

Which artist is used in the most playlists?
How many tracks have been purchased vs not purchased?
Is the range of tracks in the store reflective of their sales popularity?
Do protected vs. non-protected media types have an effect on popularity?

Answering Business Questions using SQL (Intermediate SQL in R): Guided Project Solutions

Zhiwen Shi

2022-10-15

Load library

Creating Helper Functions

the best genres in the USA

answer from the course

Analyzing Employee Sales Performance

answer from the course

Visualizing Sales by Country

answer from the course

Albums vs. Individual Tracks

answer from the course