Sakila DVD Rental Data Analysis¶

By Chioma Kamalu

Project Summary¶

Purpose¶

This notebook analyzes the Sakila DVD Rental dataset to answer key business questions and derive actionable insights. It utilizes a mix of SQL and Python for querying, data transformation, and visualization.

Key Questions Addressed¶

  1. Total Customers and Stores: Identifies the total number of customers and stores in the Sakila dataset.
  2. Top Rented Films: Ranks the top 10 most frequently rented films.
  3. Film Rating Categories: Analyzes the distribution of films by rating categories (e.g. R, PG, PG-13, G).
  4. Actors in Most Films: Finds actors with the highest film appearances.
  5. Rental Rates: Explores the distribution of film rental rates in the inventory.
  6. Customer Behavior: Investigates metrics like average rental duration, movie length, and spending per customer.
  7. Revenue by Location: Identifies cities and countries that generate the most revenue.
  8. Monthly Revenue Trends: Examines revenue patterns over time.

Notebook Overview¶

The notebook includes detailed explanations of the questions being addressed, making it reader-friendly for both technical and non-technical audiences.

Code Content¶

Code cells use SQL queries to interact with the SQLite database, with Python for further analysis and visualization.

Outputs¶

Cells produce outputs, including tabular data and visualizations that directly address the questions posed.

Technologies and Methods¶

  1. SQL:
    • Used extensively to query the Sakila database for insights.
  2. Python:
    • Libraries like pandas and matplotlib are used for data manipulation and visualization.
  3. Data Visualization:
    • Charts and tables are employed to present insights, such as rental rate distributions and revenue trends.

Potential Applications¶

  1. Business Strategy: Insights can be used to optimize inventory, marketing strategies, and revenue generation.
  2. Customer Segmentation: Spending and rental behavior data help target customers more effectively.
In [1]:
import sqlite3
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
import os
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] =  (22,5)
plt.rcParams['font.size'] = 14
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False
In [4]:
!pip install kaggle
Requirement already satisfied: kaggle in /usr/local/lib/python3.10/dist-packages (1.6.17)
In [6]:
# Create the Kaggle directory and move the kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Set the permissions for the Kaggle API file
!chmod 600 ~/.kaggle/kaggle.json
In [ ]:
import kagglehub

# Download the Sakila SQLite Sample Database
dataset_path = kagglehub.dataset_download('atanaskanev/sqlite-sakila-sample-database')

# Print the path to confirm download
print('Dataset downloaded to:', dataset_path)
In [11]:
import os

# Check the contents of the dataset directory
print(os.listdir(dataset_path))
['sqlite-sakila-db', 'README.txt', 'SQLite3 Sakila Sample Database ERD.png', 'sqlite-sakila.db']
In [14]:
from IPython.display import Image, display
import os

# Full path to the ERD image
er_diagram = os.path.join(dataset_path, 'SQLite3 Sakila Sample Database ERD.png')

# Display the image
display(Image(filename=er_diagram))
[Output: SQLite3 Sakila Sample Database ERD image]
In [15]:
import sqlite3
import pandas as pd
import os

# Use the correct path to the SQLite database file
database_path = os.path.join(dataset_path, 'sqlite-sakila.db')

# Connect to the database
conn = sqlite3.connect(database_path)

# Query the sqlite_master table to list all tables
query = '''
SELECT name, type
FROM sqlite_master
WHERE type = 'table'
'''
master_file = pd.read_sql(query, conn)

# Extract the table names
tables = master_file['name'].values.tolist()

# Display the tables
print("Tables in the Sakila Database:")
print(tables)

# Display the sqlite_master table information
master_file
Tables in the Sakila Database:
['actor', 'country', 'city', 'address', 'language', 'category', 'customer', 'film', 'film_actor', 'film_category', 'film_text', 'inventory', 'staff', 'store', 'payment', 'rental']
Out[15]:
name type
0 actor table
1 country table
2 city table
3 address table
4 language table
5 category table
6 customer table
7 film table
8 film_actor table
9 film_category table
10 film_text table
11 inventory table
12 staff table
13 store table
14 payment table
15 rental table

How many total customers are there, and how many stores does Sakila DVD Rental operate?¶

In [16]:
query = '''
select 'Total Customers' as Metric, count(distinct customer_id) as count from customer
UNION ALL
select 'Total Stores' as Metric, count(distinct store_id) as count from store
'''
total = pd.read_sql(query, conn)
total
Out[16]:
Metric count
0 Total Customers 599
1 Total Stores 2

What are the top 10 most frequently rented films?¶

In [44]:
query = '''
select t3.title as movie, count(t3.title) as rented_count
from inventory t1 left join rental t2 on t1.inventory_id = t2.inventory_id
left join film t3 on t1.film_id = t3.film_id
group by t3.title
order by 2 desc
limit 10
'''
top_rented_films = pd.read_sql(query, conn)
top_rented_films
Out[44]:
movie rented_count
0 BUCKET BROTHERHOOD 34
1 ROCKETEER MOTHER 33
2 SCALAWAG DUCK 32
3 RIDGEMONT SUBMARINE 32
4 JUGGLER HARDLY 32
5 GRIT CLOCKWORK 32
6 FORWARD TEMPLE 32
7 ZORRO ARK 31
8 WIFE TURN 31
9 TIMBERLAND SKY 31

How many films are available in each rating category (R, PG, PG-13, G, etc.)?¶

In [18]:
query = '''
select rating, count(rating) as movie_count from film
group by rating
order by movie_count desc
'''
rating_count = pd.read_sql(query, conn)
rating_count
Out[18]:
rating movie_count
0 PG-13 223
1 NC-17 210
2 R 195
3 PG 194
4 G 178

Which actors appear in the most films?¶

In [19]:
query = '''
select first_name || " " || last_name as actor_name,
        count(t2.actor_id) as appeared_count
from film t1 join film_actor t2 on t1.film_id = t2.film_id
left join actor t3 on t2.actor_id = t3.actor_id
group by t2.actor_id
order by appeared_count desc
limit 5
'''
top_appearances = pd.read_sql(query, conn)
top_appearances
Out[19]:
actor_name appeared_count
0 GINA DEGENERES 42
1 WALTER TORN 41
2 MARY KEITEL 40
3 MATTHEW CARREY 39
4 SANDRA KILMER 37

What is the distribution of film rental rates in the inventory?¶

In [20]:
query = '''
select rental_rate, (count(rental_rate) * 100.0 / (SELECT count(*) FROM film)) || '%' as distribution
from film
group by rental_rate
'''
rental_rate_distr = pd.read_sql(query, conn)
rental_rate_distr
Out[20]:
rental_rate distribution
0 0.99 34.1%
1 2.99 32.3%
2 4.99 33.6%
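For a cross-check outside SQL, the same distribution could be computed with pandas' `value_counts` (a sketch on hypothetical rates; in the notebook the column would come from `pd.read_sql('SELECT rental_rate FROM film', conn)`):

```python
import pandas as pd

# Hypothetical rental_rate values standing in for the film table's column
rates = pd.Series([0.99, 0.99, 2.99, 4.99, 0.99, 2.99, 4.99, 4.99, 0.99, 2.99])

# Share of each rate, formatted like the SQL query's output
distribution = (rates.value_counts(normalize=True) * 100).round(1).astype(str) + '%'
print(distribution)
```

`value_counts(normalize=True)` replaces the `count(...) * 100.0 / (SELECT count(*) ...)` pattern in one call.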

What is the average rental duration, average movie length, and average amount spent per customer?¶

In [21]:
query = '''
select t3.customer_id, round(avg(t1.length),2) || " mins" as avg_length,
        round(avg(t1.rental_duration),2) || " days" as avg_rental_duration,
        round(avg(t4.amount),2) as avg_spend_usd
from film t1 left join inventory t2 on t1.film_id = t2.film_id
left join rental t3 on t2.inventory_id =  t3.inventory_id
left join payment t4 on t3.rental_id =  t4.rental_id
where t3.customer_id is not null
group by t3.customer_id
order by avg_spend_usd desc

'''
avg_duration_spent = pd.read_sql(query, conn)
avg_duration_spent.head()
Out[21]:
customer_id avg_length avg_rental_duration avg_spend_usd
0 187 123.21 mins 4.64 days 5.70
1 433 117.36 mins 4.56 days 5.35
2 321 116.0 mins 4.45 days 5.31
3 542 101.89 mins 4.17 days 5.27
4 259 114.47 mins 4.84 days 5.27
In [22]:
avg_duration_spent.tail()
Out[22]:
customer_id avg_length avg_rental_duration avg_spend_usd
594 115 119.03 mins 4.9 days 3.06
595 381 113.14 mins 5.43 days 3.05
596 64 106.52 mins 4.85 days 3.05
597 395 97.16 mins 5.21 days 3.04
598 252 117.27 mins 4.68 days 2.94

What cities and countries generate the most revenue?¶

In [23]:
query = '''
WITH cte AS (
    SELECT t4.city, t5.country, t6.amount
    FROM rental t1
    JOIN customer t2 ON t1.customer_id = t2.customer_id
    JOIN address t3 ON t2.address_id = t3.address_id
    JOIN city t4 ON t3.city_id = t4.city_id
    JOIN country t5 ON t4.country_id = t5.country_id
    LEFT JOIN payment t6 ON t1.rental_id = t6.rental_id
),
city_cte AS (
    SELECT 'City' AS Geography, city AS Location, "$" || ROUND(SUM(amount), 2) AS revenue
    FROM cte
    GROUP BY city
    ORDER BY SUM(amount) DESC
    LIMIT 1
),
country_cte AS (
    SELECT 'Country' AS Geography, country AS Location, "$" || ROUND(SUM(amount), 2) AS revenue
    FROM cte
    GROUP BY country
    ORDER BY SUM(amount) DESC
    LIMIT 1
)

SELECT * FROM city_cte
UNION ALL
SELECT * FROM country_cte;
'''
location_revenue = pd.read_sql(query, conn)
location_revenue
Out[23]:
Geography Location revenue
0 City Cape Coral $221.55
1 Country India $6628.28

What is the monthly revenue trend throughout the available time period?¶

In [24]:
query = '''
select strftime("%Y-%m", t1.rental_date) as month,
       sum(t2.amount) as revenue_usd
from rental t1 left join payment t2 on t1.rental_id = t2.rental_id
group by strftime("%Y-%m", t1.rental_date)
order by month asc
'''
monthly_revenue = pd.read_sql(query, conn)
monthly_revenue
Out[24]:
month revenue_usd
0 2005-05 4823.44
1 2005-06 9629.89
2 2005-07 28368.91
3 2005-08 24070.14
4 2006-02 514.18
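Since revenue trends are one of the notebook's stated visualization targets, the table above can be plotted directly (a sketch that hard-codes the figures from the output; in the notebook you would plot the `revenue_usd` column of the query result instead):

```python
import matplotlib.pyplot as plt

# Figures copied from the monthly revenue table above
months = ['2005-05', '2005-06', '2005-07', '2005-08', '2006-02']
revenue = [4823.44, 9629.89, 28368.91, 24070.14, 514.18]

fig, ax = plt.subplots()
ax.plot(months, revenue, marker='o')
ax.set_xlabel('Month')
ax.set_ylabel('Revenue (USD)')
ax.set_title('Monthly Revenue Trend')
plt.show()
```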

How many rentals do customers make on average, and what's the distribution?¶

In [25]:
query = '''
select strftime("%Y-%m-%d", rental_date) as rental_date,
        round(count(distinct rental_id)*1.0/count(distinct customer_id),2) as avg_rental
from rental
group by strftime("%Y-%m-%d", rental_date)
order by 1 asc

'''
avg_rental = pd.read_sql(query, conn)
avg_rentals = avg_rental['avg_rental'].mean()
print(f"Customers make an average of {avg_rentals:.2f} rentals per active day")
Customers make an average of 1.37 rentals per active day
In [46]:
ax = avg_rental.plot(kind='box', title='Box Plot of Rental Distribution')

# Get the median and other statistics
median = avg_rental['avg_rental'].median()
q1 = avg_rental['avg_rental'].quantile(0.25)
q3 = avg_rental['avg_rental'].quantile(0.75)

# Add text labels for key statistics
ax.text(1.1, median, 'Median', ha='center', color='RED')
ax.text(1.1, q1, 'Q1', ha='center', color='C0')
ax.text(1.1, q3, 'Q3', ha='center', color='C0')

plt.show()
[Output: box plot of the daily average-rental distribution, with Median, Q1, and Q3 labeled]

Which customers are most valuable based on their rental frequency and total spending?¶

In [27]:
query = '''
with cte as (select "Most Rented" as category,
                    t1.customer_id,
                    count(t1.customer_id) as rental_frequency,
                    sum(t2.amount) as total_spend_usd
            from rental t1 join payment t2 on t1.rental_id = t2.rental_id
            group by t1.customer_id
            order by rental_frequency desc
            limit 1),

cte_1 as (select "Highest Spend" as category,
                t1.customer_id,
                count(t1.customer_id) as rental_frequency,
                sum(t2.amount) as total_spend_usd
from rental t1 join payment t2 on t1.rental_id = t2.rental_id
group by t1.customer_id
order by total_spend_usd desc
limit 1)

select * from cte
UNION ALL
select * from cte_1

'''
valuable_customers = pd.read_sql(query, conn)
valuable_customers
Out[27]:
category customer_id rental_frequency total_spend_usd
0 Most Rented 148 46 216.54
1 Highest Spend 526 45 221.55

Calculate the customer lifetime value (CLV) for each customer, accounting for their rental history and spending patterns.¶

$$\text{Customer lifetime value} = \text{Avg Spend per Rental} \times \text{Avg Rentals per Year} \times \text{Customer Tenure (Years)}$$
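The formula translates directly into a small helper (a sketch; the inputs below are hypothetical, while the SQL cell that follows computes the same three components per customer):

```python
def customer_lifetime_value(avg_spend_per_rental, rentals_per_year, tenure_years):
    """CLV = avg spend per rental x avg rentals per year x tenure in years."""
    return round(avg_spend_per_rental * rentals_per_year * tenure_years, 2)

# Hypothetical customer: $4.25 per rental, 30 rentals a year, 1.5 years of history
print(customer_lifetime_value(4.25, 30, 1.5))  # 191.25
```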

In [28]:
query = '''
with cte as (
SELECT t1.customer_id, SUM(t2.amount) AS total_spent,
    ROUND(AVG(t2.amount),2) AS avg_spend_per_rental,
    COUNT(t1.rental_id) AS total_rentals,
    ROUND(COUNT(t1.rental_id) / ((JULIANDAY('now') - JULIANDAY(MIN(t1.rental_date))) / 365.25),2) AS rentals_per_year,
    MIN(t1.rental_date) AS first_rental_date,
    ROUND((JULIANDAY('now') - JULIANDAY(MIN(t1.rental_date))) / 365.25, 2) AS customer_tenure
FROM rental t1 left join payment t2 on t1.rental_id = t2.rental_id
GROUP BY t1.customer_id)

select customer_id,
    ROUND((avg_spend_per_rental * rentals_per_year * customer_tenure),2) as clv
from cte
order by clv desc
'''
clv = pd.read_sql(query, conn)
clv.head()
Out[28]:
customer_id clv
0 526 221.45
1 148 216.50
2 144 196.07
3 178 194.81
4 137 194.23
In [29]:
#customers with lowest clv
clv.tail()
Out[29]:
customer_id clv
594 97 58.69
595 395 57.56
596 318 52.65
597 281 51.12
598 248 51.08

What are the peak rental periods (time of day, day of week, month) that might indicate seasonal patterns?¶

In [30]:
query = '''
WITH rentals_extended AS (
    SELECT
        *,
         CASE strftime('%m', rental_date)
            WHEN '01' THEN 'January'
            WHEN '02' THEN 'February'
            WHEN '03' THEN 'March'
            WHEN '04' THEN 'April'
            WHEN '05' THEN 'May'
            WHEN '06' THEN 'June'
            WHEN '07' THEN 'July'
            WHEN '08' THEN 'August'
            WHEN '09' THEN 'September'
            WHEN '10' THEN 'October'
            WHEN '11' THEN 'November'
            WHEN '12' THEN 'December'
        END AS rental_month
    FROM rental
)

SELECT rental_month, COUNT(*) AS rental_count
FROM rentals_extended
GROUP BY rental_month
ORDER BY rental_count desc
limit 1
'''
rental_month_df = pd.read_sql(query, conn)
rental_month_df
Out[30]:
rental_month rental_count
0 July 6709
In [31]:
query = '''
WITH rentals_extended AS (
    SELECT
        *,
        CAST(strftime('%H', rental_date) AS INTEGER) AS rental_hour
    FROM rental
)

SELECT rental_hour, COUNT(*) AS rental_count
FROM rentals_extended
GROUP BY rental_hour
ORDER BY rental_count desc
limit 1
'''
rental_hour_df = pd.read_sql(query, conn)
rental_hour_df
Out[31]:
rental_hour rental_count
0 15 887
In [32]:
query = '''
WITH rentals_extended AS (
    SELECT
        *,CASE strftime('%w', rental_date)
            WHEN '0' THEN 'Sunday'
            WHEN '1' THEN 'Monday'
            WHEN '2' THEN 'Tuesday'
            WHEN '3' THEN 'Wednesday'
            WHEN '4' THEN 'Thursday'
            WHEN '5' THEN 'Friday'
            WHEN '6' THEN 'Saturday'
        END AS rental_day
    FROM rental
)

SELECT rental_day, COUNT(*) AS rental_count
FROM rentals_extended
GROUP BY rental_day
ORDER BY rental_count desc
LIMIT 1
'''
rental_day_df = pd.read_sql(query, conn)
rental_day_df
Out[32]:
rental_day rental_count
0 Tuesday 2463
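The day-of-week logic can be cross-checked in pandas, which maps weekday names directly rather than via SQLite's `strftime('%w', ...)` codes (Sunday = '0'). A sketch on three hypothetical rental timestamps; in the notebook the column would come from the `rental` table:

```python
import pandas as pd

# Hypothetical rental timestamps standing in for rental.rental_date
dates = pd.to_datetime(pd.Series([
    '2005-05-24 22:53:30',
    '2005-05-31 19:15:11',
    '2005-07-05 22:49:24',
]))

# Most common weekday among the rentals
busiest_day = dates.dt.day_name().mode()[0]
print(busiest_day)  # Tuesday
```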

How does film performance vary by category, and which categories should we stock more of?¶

In [33]:
query = '''
with cte as (SELECT
  t5.name AS category_name,
  COUNT(t5.name) AS total_rentals,
  COUNT(DISTINCT t1.rental_date) AS active_rental_days,
  ROUND(COUNT(t5.name) * 1.0 / NULLIF(COUNT(DISTINCT t1.rental_date), 0),4) AS performance_score
FROM rental t1
LEFT JOIN inventory t2 ON t1.inventory_id = t2.inventory_id
LEFT JOIN film t3 ON t2.film_id = t3.film_id
LEFT JOIN film_category t4 ON t3.film_id = t4.film_id
LEFT JOIN category t5 ON t4.category_id = t5.category_id
GROUP BY t5.name)

select *, dense_rank() over(order by performance_score desc) as performance_rank
from cte
order by performance_rank asc
'''
category_performance = pd.read_sql(query, conn)
category_performance
Out[33]:
category_name total_rentals active_rental_days performance_score performance_rank
0 Animation 1166 1145 1.0183 1
1 Action 1112 1096 1.0146 2
2 Horror 846 834 1.0144 3
3 Games 969 956 1.0136 4
4 New 940 928 1.0129 5
5 Sports 1179 1164 1.0129 5
6 Music 830 820 1.0122 6
7 Family 1096 1084 1.0111 7
8 Travel 837 828 1.0109 8
9 Foreign 1033 1023 1.0098 9
10 Classics 939 930 1.0097 10
11 Comedy 941 933 1.0086 11
12 Drama 1060 1053 1.0066 12
13 Sci-Fi 1101 1094 1.0064 13
14 Children 945 940 1.0053 14
15 Documentary 1050 1045 1.0048 15

What is the correlation between film characteristics (length, rating, replacement cost) and rental frequency?¶

In [34]:
query = '''
WITH film_rental_data AS (
  SELECT
    f.film_id,
    f.length,
    f.rating,
    f.replacement_cost,
    COUNT(i.inventory_id) AS rental_frequency
  FROM film f
  LEFT JOIN inventory i ON f.film_id = i.film_id
  GROUP BY f.film_id, f.length, f.rating, f.replacement_cost
),
stats AS (
  SELECT
    AVG(length) AS length_mean,
    AVG((length - (SELECT AVG(length) FROM film_rental_data)) *
        (length - (SELECT AVG(length) FROM film_rental_data))) AS length_variance,
    AVG(replacement_cost) AS replacement_cost_mean,
    AVG((replacement_cost - (SELECT AVG(replacement_cost) FROM film_rental_data)) *
        (replacement_cost - (SELECT AVG(replacement_cost) FROM film_rental_data))) AS replacement_cost_variance,
    AVG(rental_frequency) AS rental_frequency_mean,
    AVG((rental_frequency - (SELECT AVG(rental_frequency) FROM film_rental_data)) *
        (rental_frequency - (SELECT AVG(rental_frequency) FROM film_rental_data))) AS rental_frequency_variance
  FROM film_rental_data
),
rating_numeric AS (
  SELECT film_id,
         CASE rating
           WHEN 'G' THEN 1
           WHEN 'PG' THEN 2
           WHEN 'PG-13' THEN 3
           WHEN 'R' THEN 4
           WHEN 'NC-17' THEN 5
           ELSE NULL
         END AS rating_numeric
  FROM film
),
correlation_calculations AS (
  SELECT
    SUM((length - (SELECT length_mean FROM stats)) *
        (rental_frequency - (SELECT rental_frequency_mean FROM stats))) /
        ((SELECT length_variance FROM stats) * (SELECT rental_frequency_variance FROM stats) * (COUNT(*) - 1)) AS length_correlation,

    SUM((COALESCE(rating_numeric, 0) - (SELECT AVG(rating_numeric) FROM rating_numeric)) *
        (rental_frequency - (SELECT rental_frequency_mean FROM stats))) /
        (SQRT((SELECT AVG((COALESCE(rating_numeric, 0) - (SELECT AVG(rating_numeric) FROM rating_numeric)) *
                          (COALESCE(rating_numeric, 0) - (SELECT AVG(rating_numeric) FROM rating_numeric)))
                FROM rating_numeric)) *
         (SELECT rental_frequency_variance FROM stats) * (COUNT(*) - 1)) AS rating_correlation,

    SUM((replacement_cost - (SELECT replacement_cost_mean FROM stats)) *
        (rental_frequency - (SELECT rental_frequency_mean FROM stats))) /
        ((SELECT replacement_cost_variance FROM stats) *
         (SELECT rental_frequency_variance FROM stats) * (COUNT(*) - 1)) AS replacement_cost_correlation
  FROM film_rental_data
  LEFT JOIN rating_numeric USING (film_id)
)

SELECT * FROM correlation_calculations;


'''
corr_df = pd.read_sql(query, conn)
corr_df
Out[34]:
length_correlation rating_correlation replacement_cost_correlation
0 -0.000236 -0.002075 0.007212
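The hand-rolled SQL above is easy to get subtly wrong (the length and replacement-cost terms appear to divide by variances where Pearson's formula calls for standard deviations, which would shrink the results). Once the per-film table is in a DataFrame, `DataFrame.corr` gives Pearson correlations in one call. A sketch on hypothetical data; in the notebook the frame would come from `pd.read_sql` over the `film_rental_data` CTE:

```python
import pandas as pd

# Hypothetical per-film rows standing in for the film_rental_data CTE
films = pd.DataFrame({
    'length': [86, 120, 150, 60, 110],
    'replacement_cost': [20.99, 12.99, 18.99, 9.99, 14.99],
    'rental_frequency': [7, 8, 6, 9, 7],
})

# Pearson correlation of each characteristic with rental frequency
correlations = films.corr(method='pearson')['rental_frequency']
print(correlations)
```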

Which stores are performing better, and what factors might contribute to their success?¶

In [35]:
query = '''
select t1.store_id,
    sum(t4.amount) as revenue_generated,
    count(t1.store_id) as rental_count,
    round(sum(t4.amount)/count(t1.store_id),2) as revenue_per_rental
from store t1 left join inventory t2 on t1.store_id = t2.store_id
left join rental t3 on t2.inventory_id = t3.inventory_id
left join payment t4 on t3.rental_id = t4.rental_id
group by t1.store_id
order by t1.store_id
'''
store_performance = pd.read_sql(query, conn)
store_performance
Out[35]:
store_id revenue_generated rental_count revenue_per_rental
0 1 33679.79 7923 4.25
1 2 33726.77 8122 4.15

Create a customer segmentation analysis based on recency, frequency, and monetary value (RFM analysis).¶

  • Recency: Time elapsed since the customer's most recent rental (based on the rental date).
  • Frequency: Total number of rentals completed by the customer.
  • Monetary: Total spending by the customer, calculated from payments for their rentals.
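The same scoring can be sketched in pandas with `pd.cut`, using the identical cutoffs as the SQL below (toy data; real values would come from the `rfm_data` CTE):

```python
import pandas as pd

# Toy RFM inputs for three hypothetical customers
rfm = pd.DataFrame({
    'customer_id': [1, 2, 3],
    'recency': [12, 75, 200],          # days since last rental
    'frequency': [14, 6, 2],           # total rentals
    'monetary_value': [520.0, 250.0, 40.0],
})

# Same bands as the SQL CASE expressions
rfm['recency_score'] = pd.cut(rfm['recency'], [-1, 30, 60, 90, float('inf')],
                              labels=['R1', 'R2', 'R3', 'R4'])
rfm['frequency_score'] = pd.cut(rfm['frequency'], [-1, 4, 9, float('inf')],
                                labels=['F3', 'F2', 'F1'])
rfm['monetary_score'] = pd.cut(rfm['monetary_value'], [-1, 199.99, 499.99, float('inf')],
                               labels=['M3', 'M2', 'M1'])
print(rfm[['customer_id', 'recency_score', 'frequency_score', 'monetary_score']])
```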
In [36]:
query = '''
with rfm_data as (select t1.customer_id,
    round(JULIANDAY('now') - JULIANDAY(MAX(t1.rental_date)),2) AS recency,
    count(t1.rental_id) as frequency,
    sum(t2.amount) as monetary_value
from rental t1 join payment t2 on t1.rental_id = t2.rental_id
group by t1.customer_id)

SELECT *,
CASE WHEN recency <= 30 THEN 'R1'   -- Recency: most recent
        WHEN recency BETWEEN 31 AND 60 THEN 'R2'
        WHEN recency BETWEEN 61 AND 90 THEN 'R3'
        ELSE 'R4'  -- Recency: least recent
    END AS recency_score,

    CASE
        WHEN frequency >= 10 THEN 'F1'  -- Frequency: most frequent
        WHEN frequency BETWEEN 5 AND 9 THEN 'F2'
        ELSE 'F3'  -- Frequency: least frequent
    END AS frequency_score,

    CASE
        WHEN monetary_value >= 500 THEN 'M1'  -- Monetary: high spenders
        WHEN monetary_value BETWEEN 200 AND 499 THEN 'M2'
        ELSE 'M3'  -- Monetary: low spenders
    END AS monetary_score
from rfm_data
ORDER BY recency, frequency, monetary_value
'''
rfm_analysis = pd.read_sql(query, conn)
rfm_analysis.head()
Out[36]:
customer_id recency frequency monetary_value recency_score frequency_score monetary_score
0 315 6884.16 17 84.83 R4 F1 M3
1 525 6884.16 19 74.81 R4 F1 M3
2 548 6884.16 19 79.81 R4 F1 M3
3 162 6884.16 20 71.80 R4 F1 M3
4 355 6884.16 20 72.80 R4 F1 M3

Analyze the relationship between staff performance (rental facilitation) and store revenue.¶

In [37]:
query = '''
select t5.first_name, count(t5.first_name) as rental_count,
    sum(t4.amount) as revenue_generated,
    round(sum(t4.amount)/count(t5.first_name),2) as revenue_per_rental
from store t1 left join inventory t2 on t1.store_id = t2.store_id
left join rental t3 on t2.inventory_id = t3.inventory_id
left join payment t4 on t3.rental_id = t4.rental_id
left join staff t5 on t1.store_id = t5.store_id
group by t5.first_name
'''
staff_performance = pd.read_sql(query, conn)
staff_performance
Out[37]:
first_name rental_count revenue_generated revenue_per_rental
0 Jon 8122 33726.77 4.15
1 Mike 7923 33679.79 4.25

What is the inventory turnover rate for different film categories, and how should we optimize our inventory?¶

In [38]:
query = '''

WITH cte AS (
    SELECT t4.name,
           COUNT(t5.rental_id) AS total_rentals,
           COUNT(DISTINCT t2.inventory_id) AS total_inventory,
           ROUND(COUNT(t5.rental_id) * 1.0 / COUNT(DISTINCT t2.inventory_id), 2) AS inventory_turnover_rate,
           SUM(t6.amount) AS total_revenue,  -- Total revenue generated per category
           ROUND(SUM(t6.amount) / COUNT(DISTINCT t2.inventory_id), 2) AS revenue_per_inventory
    FROM film t1
    LEFT JOIN inventory t2 ON t1.film_id = t2.film_id
    LEFT JOIN film_category t3 ON t2.film_id = t3.film_id
    LEFT JOIN category t4 ON t3.category_id = t4.category_id
    LEFT JOIN rental t5 ON t2.inventory_id = t5.inventory_id
    LEFT JOIN payment t6 ON t5.rental_id = t6.rental_id
    WHERE t4.name IS NOT NULL
    GROUP BY t4.name
),

avg_inventory AS (
    SELECT AVG(inventory_turnover_rate) AS avg_inv
    FROM cte
),

variance_calc AS (
    SELECT name,
           total_rentals,
           total_inventory,
           inventory_turnover_rate,
           total_revenue,
           revenue_per_inventory,
           (inventory_turnover_rate - avg_inv) * (inventory_turnover_rate - avg_inv) AS deviation_squared
    FROM cte
    CROSS JOIN avg_inventory
),

std_dev AS (
    SELECT SQRT(AVG(deviation_squared)) AS stddev_turnover_rate
    FROM variance_calc
),

classification AS (
    SELECT v.name,
           v.total_rentals,
           v.total_inventory,
           v.inventory_turnover_rate,
           v.total_revenue,
           v.revenue_per_inventory,
           a.avg_inv,
           round(s.stddev_turnover_rate,2) as stddev_turnover_rate,
           CASE
               WHEN v.inventory_turnover_rate >= a.avg_inv + s.stddev_turnover_rate THEN 'High'
               WHEN v.inventory_turnover_rate <= a.avg_inv - s.stddev_turnover_rate THEN 'Low'
               ELSE 'Medium'
           END AS classification
    FROM variance_calc v
    CROSS JOIN avg_inventory a
    CROSS JOIN std_dev s
)

SELECT name, classification,
total_inventory,total_rentals,
inventory_turnover_rate,
round(avg_inv,2) as avg_inventory_rate,
total_revenue,
revenue_per_inventory

FROM classification
ORDER BY inventory_turnover_rate desc, name;
'''
inventory_turnover = pd.read_sql(query, conn)
inventory_turnover
Out[38]:
name classification total_inventory total_rentals inventory_turnover_rate avg_inventory_rate total_revenue revenue_per_inventory
0 Music High 232 830 3.58 3.5 3417.72 14.73
1 Documentary High 294 1050 3.57 3.5 4217.52 14.35
2 Action High 312 1112 3.56 3.5 4375.85 14.03
3 Travel High 235 837 3.56 3.5 3549.64 15.10
4 Family Medium 310 1096 3.54 3.5 4226.07 13.63
5 Drama Medium 300 1060 3.53 3.5 4587.39 15.29
6 Sci-Fi Medium 312 1101 3.53 3.5 4756.98 15.25
7 Children Medium 269 945 3.51 3.5 3655.55 13.59
8 Games Medium 276 969 3.51 3.5 4281.33 15.51
9 Comedy Medium 269 941 3.50 3.5 4383.58 16.30
10 Animation Medium 335 1166 3.48 3.5 4656.30 13.90
11 Classics Medium 270 939 3.48 3.5 3639.59 13.48
12 Foreign Low 300 1033 3.44 3.5 4270.67 14.24
13 Sports Low 344 1179 3.43 3.5 5314.21 15.45
14 New Low 275 940 3.42 3.5 4351.62 15.82
15 Horror Low 248 846 3.41 3.5 3722.54 15.01
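The High/Medium/Low bands above are mean ± one population standard deviation of the turnover rates, and numpy reproduces the classification from the table's own column (a sketch using the rates printed above):

```python
import numpy as np

# Turnover rates copied from the table above, highest to lowest
rates = np.array([3.58, 3.57, 3.56, 3.56, 3.54, 3.53, 3.53, 3.51,
                  3.51, 3.50, 3.48, 3.48, 3.44, 3.43, 3.42, 3.41])

mean, std = rates.mean(), rates.std()  # population std, matching SQRT(AVG(dev^2))
labels = np.where(rates >= mean + std, 'High',
                  np.where(rates <= mean - std, 'Low', 'Medium'))
print(round(mean, 2), round(std, 2))  # 3.5 0.05
print(labels)
```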

Calculate customer churn rate and identify patterns among customers who eventually churn.¶

In [40]:
query = '''
WITH customer_rental_span AS (
    SELECT
        t1.customer_id,
        CAST(MAX(julianday(t2.rental_date)) - MIN(julianday(t2.rental_date)) AS INTEGER) AS rental_period_days,
        COUNT(t2.rental_id) AS rental_count
    FROM customer t1
    LEFT JOIN rental t2 ON t1.customer_id = t2.customer_id
    GROUP BY t1.customer_id
)

SELECT
    customer_id,
    rental_count,
    rental_period_days,
    CASE
        WHEN rental_period_days > 180 THEN 'Potential Churn'
        ELSE 'Active'
    END AS churn_status
FROM
    customer_rental_span
ORDER BY
    rental_period_days DESC;
'''
churn_df = pd.read_sql(query, conn)
churn_df.head()
Out[40]:
customer_id rental_count rental_period_days churn_status
0 44 26 265 Potential Churn
1 53 30 265 Potential Churn
2 73 24 265 Potential Churn
3 100 24 265 Potential Churn
4 120 32 265 Potential Churn
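The heading asks for a churn rate, which the classification above makes a one-liner (a sketch with a toy stand-in for `churn_df`; in the notebook the real DataFrame already exists):

```python
import pandas as pd

# Toy stand-in for the churn_df produced by the query above
churn_demo = pd.DataFrame({
    'customer_id': [44, 53, 19, 25],
    'churn_status': ['Potential Churn', 'Potential Churn', 'Active', 'Active'],
})

# Share of customers flagged as potential churn
churn_rate = (churn_demo['churn_status'] == 'Potential Churn').mean() * 100
print(f'Churn rate: {churn_rate:.1f}%')  # Churn rate: 50.0%
```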
In [ ]:
#Active customers
churn_df.query('churn_status == "Active"').head()
Out[ ]:
customer_id rental_count rental_period_days churn_status
158 19 24 90 Active
159 25 29 90 Active
160 37 23 90 Active
161 86 33 90 Active
162 88 21 90 Active
In [39]:
query = '''
WITH customer_rental_span AS (
    SELECT
        t1.customer_id,
        CAST(MAX(julianday(t2.rental_date)) - MIN(julianday(t2.rental_date)) AS INTEGER) AS rental_period_days,
        COUNT(t2.rental_id) AS rental_count
    FROM customer t1
    LEFT JOIN rental t2 ON t1.customer_id = t2.customer_id
    GROUP BY t1.customer_id
),

churn_cte as (SELECT
    customer_id,
    rental_count,
    rental_period_days,
    CASE
        WHEN rental_period_days > 180 THEN 'Potential Churn'
        ELSE 'Active'
    END AS churn_status
FROM customer_rental_span
ORDER BY rental_period_days DESC)

select strftime("%Y-%m", t2.rental_date) as month,
    'Churned Customer' as category,
    count(t2.customer_id) as rental_count
from rental t2
where t2.customer_id in (select customer_id
                         from churn_cte
                         where churn_status = 'Potential Churn')
  AND t2.rental_date IS NOT NULL
group by strftime("%Y-%m", t2.rental_date)
order by month asc;
'''
churned_df_rental_count = pd.read_sql(query, conn)
churned_df_rental_count
Out[39]:
month category rental_count
0 2005-05 Churned Customer 299
1 2005-06 Churned Customer 618
2 2005-07 Churned Customer 1785
3 2005-08 Churned Customer 1482
4 2006-02 Churned Customer 182

Observations and Recommendations¶

Observation 1¶

  • Trend: Rentals from eventually-churned customers surged in July 2005, peaking at 1,785, followed by a sharp decline.
  • Insight: This suggests a seasonal trend or a one-time spike, with a significant post-peak drop-off.
  • Recommendation: Analyze campaigns, releases, or seasonal factors from July. Replicate successful tactics to capitalize on future peak seasons.

Observation 2¶

  • Trend: Rentals steadily declined month-over-month after July, with only 182 rentals from churned customers by February 2006.
  • Insight: Post-peak customers displayed shorter rental lifespans, highlighting possible disengagement or inadequate retention strategies.
  • Recommendation: Implement retention efforts immediately after peaks, like loyalty programs or personalized re-engagement offers, to sustain customer activity.

Observation 3¶

  • Trend: High churn after July indicates many churned customers were new or low-frequency renters.
  • Insight: These customers likely didn’t perceive enough value to stay engaged.
  • Recommendation: Strengthen onboarding and introduce early incentives during peak periods to build long-term customer loyalty.
In [41]:
query = '''
WITH customer_rental_span AS (
    SELECT
        t1.customer_id,
        CAST(MAX(julianday(t2.rental_date)) - MIN(julianday(t2.rental_date)) AS INTEGER) AS rental_period_days,
        COUNT(t2.rental_id) AS rental_count
    FROM customer t1
    LEFT JOIN rental t2 ON t1.customer_id = t2.customer_id
    GROUP BY t1.customer_id
),

churn_cte as (SELECT
    customer_id,
    rental_count,
    rental_period_days,
    CASE
        WHEN rental_period_days > 180 THEN 'Potential Churn'
        ELSE 'Active'
    END AS churn_status
FROM customer_rental_span
ORDER BY rental_period_days DESC),


final as (select t2.customer_id,
                'Churned Customer' as category,
            sum(case when strftime("%Y-%m", t2.rental_date) = '2005-08' then 1 else 0 end) as aug_rental_count,
            sum(case when strftime("%Y-%m", t2.rental_date) = '2006-02' then 1 else 0 end) as feb_rental_count
from rental t2
where t2.customer_id in (select customer_id
                         from churn_cte
                         where churn_status = 'Potential Churn')
      AND t2.rental_date IS NOT NULL
group by t2.customer_id)

select *, round((aug_rental_count - feb_rental_count)*100.0/aug_rental_count,2) as percentage_change
from final
order by percentage_change desc
'''
churn_customer_rental_count = pd.read_sql(query, conn)
#table of churned customer sorted by percentage change, Highest change is on top
churn_customer_rental_count.head()
Out[41]:
customer_id category aug_rental_count feb_rental_count percentage_change
0 21 Churned Customer 17 1 94.12
1 410 Churned Customer 17 1 94.12
2 29 Churned Customer 16 1 93.75
3 181 Churned Customer 16 1 93.75
4 274 Churned Customer 16 1 93.75
In [42]:
#Table of churned customer sorted by percentage change, lowest change is at the bottom
churn_customer_rental_count.tail()
Out[42]:
customer_id category aug_rental_count feb_rental_count percentage_change
153 570 Churned Customer 3 1 66.67
154 107 Churned Customer 5 2 60.00
155 208 Churned Customer 5 2 60.00
156 191 Churned Customer 2 1 50.00
157 228 Churned Customer 4 2 50.00
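The `percentage_change` column can be sanity-checked outside SQL with a few values from the table above. This is a minimal pandas sketch applying the same formula (the sample frame is hand-built for illustration from customers 21 and 29):

```python
import pandas as pd

# Sample counts copied from the output table above.
sample = pd.DataFrame({
    "customer_id": [21, 29],
    "aug_rental_count": [17, 16],
    "feb_rental_count": [1, 1],
})

# Same formula as the SQL: percentage drop relative to the August baseline.
sample["percentage_change"] = round(
    (sample["aug_rental_count"] - sample["feb_rental_count"])
    * 100.0 / sample["aug_rental_count"], 2)

print(sample)  # percentage_change: 94.12 and 93.75, matching the SQL output
```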

Develop a comprehensive store performance scorecard that considers multiple KPIs (revenue, customer satisfaction, inventory efficiency, etc.). How should we optimize our inventory?¶

In [43]:
query = '''
WITH total_customers AS (
    SELECT
        t2.store_id,
        t1.customer_id,
        COUNT(t1.rental_id) AS rental_count
    FROM rental t1
    LEFT JOIN inventory t2 ON t1.inventory_id = t2.inventory_id
    GROUP BY t2.store_id, t1.customer_id
),
repeat_customers AS (
    SELECT
        store_id,
        COUNT(DISTINCT customer_id) AS repeat_customer_count
    FROM total_customers
    WHERE rental_count > 1
    GROUP BY store_id
),
store_customers AS (
    SELECT
        store_id,
        COUNT(DISTINCT customer_id) AS total_customer_count
    FROM total_customers
    GROUP BY store_id
),
avg_rental_duration_stats AS (
    SELECT
        t2.store_id,
        AVG(julianday(t1.return_date) - julianday(t1.rental_date)) AS avg_rental_duration,
        -- Variance: mean squared deviation from the store's own average duration.
        -- The subquery aliases (r, i) are distinct from the outer t1/t2 so that
        -- the correlation on store_id refers to the outer row's store rather
        -- than being shadowed and comparing a column to itself.
        AVG((julianday(t1.return_date) - julianday(t1.rental_date) -
             (SELECT AVG(julianday(r.return_date) - julianday(r.rental_date))
              FROM rental r
              JOIN inventory i ON r.inventory_id = i.inventory_id
              WHERE i.store_id = t2.store_id)) *
             (julianday(t1.return_date) - julianday(t1.rental_date) -
             (SELECT AVG(julianday(r.return_date) - julianday(r.rental_date))
              FROM rental r
              JOIN inventory i ON r.inventory_id = i.inventory_id
              WHERE i.store_id = t2.store_id))
        ) AS variance_rental_duration,
        -- Standard deviation: square root of the variance above
        ROUND(SQRT(AVG((julianday(t1.return_date) - julianday(t1.rental_date) -
             (SELECT AVG(julianday(r.return_date) - julianday(r.rental_date))
              FROM rental r
              JOIN inventory i ON r.inventory_id = i.inventory_id
              WHERE i.store_id = t2.store_id)) *
             (julianday(t1.return_date) - julianday(t1.rental_date) -
             (SELECT AVG(julianday(r.return_date) - julianday(r.rental_date))
              FROM rental r
              JOIN inventory i ON r.inventory_id = i.inventory_id
              WHERE i.store_id = t2.store_id)))), 2) AS avg_rental_duration_variation
    FROM rental t1
    LEFT JOIN inventory t2 ON t1.inventory_id = t2.inventory_id
    GROUP BY t2.store_id
),
inventory_utilization AS (
    SELECT
        t2.store_id,
        COUNT(t1.rental_id) AS total_rentals,
        COUNT(DISTINCT t2.inventory_id) AS total_inventory,
        -- Rentals per inventory item x 100; values above 100 mean each
        -- copy was rented more than once on average.
        ROUND(COUNT(t1.rental_id) * 100.0 / COUNT(DISTINCT t2.inventory_id), 2) AS inventory_utilization_rate
    FROM rental t1
    LEFT JOIN inventory t2 ON t1.inventory_id = t2.inventory_id
    GROUP BY t2.store_id
)

SELECT
    t2.store_id,
    SUM(t3.amount) AS revenue,
    COUNT(t2.store_id) AS rentals,
    ars.avg_rental_duration,
    ars.avg_rental_duration_variation,
    COUNT(DISTINCT t2.film_id) AS variety_count,
    ROUND(SUM(t3.amount) / COUNT(DISTINCT t2.film_id), 2) AS revenue_per_unique_film,
    COUNT(DISTINCT t2.inventory_id) AS inventory,
    ROUND(SUM(t3.amount) / COUNT(t2.store_id), 2) AS revenue_per_rental,
    sc.total_customer_count,
    rc.repeat_customer_count,
    ROUND(rc.repeat_customer_count * 100.0 / sc.total_customer_count, 2) AS repeat_rental_rate,
    ROUND(SUM(t3.amount) / sc.total_customer_count, 2) AS revenue_per_customer,
    iu.inventory_utilization_rate
FROM rental t1
LEFT JOIN inventory t2 ON t1.inventory_id = t2.inventory_id
LEFT JOIN payment t3 ON t1.rental_id = t3.rental_id
LEFT JOIN store_customers sc ON t2.store_id = sc.store_id
LEFT JOIN repeat_customers rc ON t2.store_id = rc.store_id
LEFT JOIN avg_rental_duration_stats ars ON t2.store_id = ars.store_id
LEFT JOIN inventory_utilization iu ON t2.store_id = iu.store_id
GROUP BY t2.store_id
ORDER BY revenue DESC;
'''
# Transpose so metrics become rows; column order follows the query's
# ORDER BY revenue DESC (store 2 first, then store 1)
store_scorecard = pd.read_sql(query, conn).T.rename(columns={0: 'store_id_2', 1: 'store_id_1'})\
                    .apply(lambda x: round(x, 2))\
                    .reset_index()\
                    .rename(columns={'index': 'metric'})
store_scorecard
Out[43]:
metric store_id_2 store_id_1
0 store_id 2.00 1.00
1 revenue 33726.77 33679.79
2 rentals 8121.00 7923.00
3 avg_rental_duration 5.01 5.04
4 avg_rental_duration_variation 2.58 2.61
5 variety_count 762.00 759.00
6 revenue_per_unique_film 44.26 44.37
7 inventory 2310.00 2270.00
8 revenue_per_rental 4.15 4.25
9 total_customer_count 599.00 599.00
10 repeat_customer_count 599.00 599.00
11 repeat_rental_rate 100.00 100.00
12 revenue_per_customer 56.31 56.23
13 inventory_utilization_rate 351.56 349.03
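The scorecard above compares raw metric values side by side. One way to condense it into a single ranking is a min-max-normalized composite score. The sketch below is an illustrative, equal-weight approach (not part of the original analysis), using a hand-copied subset of the metrics from the table above:

```python
import pandas as pd

# Values copied from the scorecard above; the equal-weight composite
# is an illustrative choice, not part of the original analysis.
scorecard = pd.DataFrame(
    {"store_id_2": [33726.77, 8121, 762, 4.15, 56.31, 351.56],
     "store_id_1": [33679.79, 7923, 759, 4.25, 56.23, 349.03]},
    index=["revenue", "rentals", "variety_count",
           "revenue_per_rental", "revenue_per_customer",
           "inventory_utilization_rate"],
)

# Min-max normalize each metric across stores (0 = worst, 1 = best),
# then average the normalized metrics into a 0-1 composite per store.
lo, hi = scorecard.min(axis=1), scorecard.max(axis=1)
normalized = scorecard.sub(lo, axis=0).div(hi - lo, axis=0)
composite = normalized.mean()
print(composite)
```

With only two stores each metric normalizes to 0 or 1, so the composite is simply the share of metrics each store leads on; with more stores (or weights reflecting business priorities) the same code yields a graded ranking.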