Brazilian E-Commerce Public Dataset by Olist¶

Welcome! This is a Brazilian e-commerce public dataset of orders made at Olist Store. The dataset contains information on 100k orders placed between 2016 and 2018 at multiple marketplaces in Brazil. Its features allow viewing an order from multiple dimensions: from order status, price, payment and freight performance to customer location, product attributes and, finally, reviews written by customers. We also released a geolocation dataset that relates Brazilian zip codes to lat/lng coordinates.

This is real commercial data; it has been anonymised, and references to the companies and partners in the review text have been replaced with the names of Game of Thrones great houses.

Context

This dataset was generously provided by Olist, the largest department store in Brazilian marketplaces. Olist connects small businesses from all over Brazil to channels without hassle and with a single contract. Those merchants are able to sell their products through the Olist Store and ship them directly to the customers using Olist logistics partners. See more on our website: www.olist.com

After a customer purchases a product from Olist Store, the seller is notified to fulfill that order. Once the customer receives the product, or the estimated delivery date passes, the customer receives a satisfaction survey by email where they can rate the purchase experience and write some comments.

Attention

  1. An order might have multiple items.
  2. Each item might be fulfilled by a distinct seller.
  3. All text identifying stores and partners was replaced by the names of Game of Thrones great houses.

Example of a product listing on a marketplace

[Image: product listing example]

Data Schema

The data is divided in multiple datasets for better understanding and organization. Please refer to the following data schema when working with it:

[Image: data schema diagram]

Inspiration

Here are some ideas for possible outcomes from this dataset.

  1. NLP: This dataset offers a rich environment for parsing the review text across its multiple dimensions.
  2. Clustering: Some customers didn't write a review. But why are they happy or mad?
  3. Sales Prediction: With purchase date information you'll be able to predict future sales.
  4. Delivery Performance: You will also be able to work through delivery performance and find ways to optimize delivery times.
  5. Product Quality: Enjoy discovering the product categories that are more prone to customer dissatisfaction.
  6. Feature Engineering: Create features from this rich dataset or attach some external public information to it.

Acknowledgements

Thanks to Olist for releasing this dataset.

Objectives of this project¶

This project explores the Olist database analytically in PostgreSQL. The database contains information on many perspectives of an e-commerce platform and offers many analytical possibilities. This exploratory analysis focuses on describing the data analytically and visualizing it with suitable tables and charts. The main objectives of this project are:

  • Exploring the tables in the database separately to gain more insight into the data in each table
  • Finding suitable types of visualization to present the manipulated data
  • Getting further insights that the raw data does not show through aggregation and calculation
  • Finding relevant connections among the tables so that, when joined together, they can reveal insights that they cannot show alone
  • Combining Python and SQL to enhance the interpretation of the analytical results of the SQL queries

Furthermore, this project also showcases my competency with SQL and with visualizing data in Python at the same time. The findings of this project can be used in later projects, as they are expected to provide useful insights into the relationships among variables that can support analyses such as Clustering, Sales Prediction, Delivery Performance or Product Quality.

Exploratory Analysis¶

The database used in this project was set up using the process described in my previous SQL project. The data has not yet been cleaned, but it is necessary to know something about the data before cleaning it, which is why this exploratory analysis comes first.

Exploratory data analysis is an approach to analyzing data sets to summarize their main characteristics, often using statistical graphics and other data visualization methods (IBM, accessed on July 18, 2022). Since the strength of SQL is not data visualization, this project combines SQL and Python so that it can utilize both the powerful data aggregation ability of SQL and the many visualization packages in Python.

[Image: Entity Relationship Diagram of the olist database]

An Entity Relationship Diagram (ERD) describing the connections among the tables in the Olist database is given above. It is the result of the data importing process described in the previous project. Each table contains a number of variables that focus on a certain aspect of an e-commerce system. When joined with each other, the tables can show a wide range of insights depending on the information needs of the user. With nine tables, the number of possible combinations is large and can be used to answer many questions.

However, in this project each table is investigated separately to reduce the level of complexity. Each table alone already reveals many insights, and this approach suits the nature of an exploratory analysis, which is to explore the data. More specific purposes might require combining only some, but not all, of the tables. As a result, all the tables in the ERD except geolocation will be analyzed separately. geolocation is excluded because it is not connected to the other tables and contains duplicated data that has not yet been addressed in this project.

Firstly, the necessary Python packages are imported and the connection to the PostgreSQL server is established.

In [1]:
# manipulating data
import sqlalchemy
import pandas as pd
import warnings
import json

# Plotting and Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import chart_studio.plotly as py
import cufflinks as cf

# Interactive charts
from ipywidgets import interact, fixed, IntSlider
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from IPython.display import display, HTML, Image
from urllib.request import urlopen

# Options
cached_data = {} 
init_notebook_mode(connected = True)
cf.go_offline()
warnings.filterwarnings(action = 'ignore')
pd.options.display.float_format = '{:,.4f}'.format
with urlopen('https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/brazil-states.geojson') as response:
    Brazil = json.load(response) # GeoJSON (JavaScript Object Notation) with Brazilian state boundaries

# Setup PostgreSQL database connection
engine = sqlalchemy.create_engine('postgresql://postgres:3ai6ua27hi2h@localhost:5432/olist')
%load_ext sql
%sql postgresql://postgres:3ai6ua27hi2h@localhost:5432/olist
%sql SET SCHEMA 'olist';
 * postgresql://postgres:***@localhost:5432/olist
Done.
Out[1]:
[]

The analysis will start with the table category_names_english due to its small size compared to the others; it then continues with the tables customers, sellers, products, and ends with the tables related to orders.

For each table, the implemented process is similar:

  • Firstly, ten rows of the table are presented to give a visual introduction to the data structure
  • Next, questions that can be answered by the data are generated, together with the intuition behind them
  • After that, each question is answered separately by SQL and its result is visualized by Python if necessary. Some comments on the data are also given where possible to indicate what can be learned from the answer
  • At the end of each section, a short conclusion about what could be learned from the table is also given

It is better to have some guiding questions so that the analysis does not wander aimlessly, and I prefer this process because it initiates my thinking and encourages me to explore the data actively. The types of questions asked depend on the type of data being dealt with; they are usually

  • Questions on counts or relevant frequencies when the data is categorical
  • Questions on descriptive statistics when the data is numerical

These two basic types of question will change depending on the choice of aggregating variables. This is just a short introduction to how things work in this project; further explanation will be given during the analysis.
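As a minimal sketch of these two question templates, the same pattern can also be expressed in pandas. This reuses pandas and the SQLAlchemy engine assigned to engine in the setup cell above, and it assumes the tables live in the olist schema; adjust the schema qualification if your setup differs.

# Sketch of the two recurring question templates, assuming the SQLAlchemy
# `engine` from the setup cell and tables in the 'olist' schema.

# 1. Categorical data: counts and relative frequencies per category
freq = pd.read_sql(
    "SELECT state, COUNT(*) AS no_customers "
    "FROM olist.customers GROUP BY state ORDER BY no_customers DESC;",
    engine)
freq['share_pct'] = freq['no_customers'] / freq['no_customers'].sum() * 100

# 2. Numerical data: descriptive statistics for one numeric column
weights = pd.read_sql(
    "SELECT weight_g FROM olist.products WHERE weight_g IS NOT NULL;",
    engine)
print(freq.head())
print(weights['weight_g'].describe())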

Explore the table category_names_english¶

The first ten rows of the data

In [2]:
%%sql
SELECT * FROM category_names_english ORDER BY product_category LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[2]:
product_category product_category_eng
agro_industria_e_comercio agro_industry_and_commerce
alimentos food
alimentos_bebidas food_drink
artes art
artes_e_artesanato arts_and_craftmanship
artigos_de_festas party_supplies
artigos_de_natal christmas_supplies
audio audio
automotivo auto
bebes baby

Questions:

  • How many distinct categories are there?
In [3]:
%%sql
SELECT
    COUNT(DISTINCT product_category)
FROM category_names_english;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[3]:
count
73

Observations:

  • 73 distinct categories

Explore the table customers¶

First ten rows of the data

In [4]:
%%sql
SELECT * FROM customers ORDER BY state LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[4]:
customer_id customer_unique_id zip_code_prefix city state
6b04ea93ce93cef2a5fbdfe302e21331 c73db85789b600b05c503d21441940f9 69918 rio branco AC
5a0dc6805a195218a4e448653dee4740 89fd593957ef80a12f0588913e78b00d 69915 rio branco AC
c5de7db594b246f428940ced283125bd 8c792a4fb7f5d708257abf55fa899ce8 69918 rio branco AC
16aba5570b45f6b4c9204407132e2cae 5f8f12098427d3cbb37e9dcf269bb53c 69918 rio branco AC
2201362e68992f654942dc0067c1b716 f7d7fc0a59ef4363fdce6e3aa069d498 69900 rio branco AC
31dbc13addc753e210692eacaea065e4 5dbba6c01268a8ad43f79157bf4454a0 69900 rio branco AC
ca077dd95290191565abb393a580bbfc 70f326d437e44cf69da00c9169b74fb3 69911 rio branco AC
5a5f70fead43dc16ec4d5afed5281239 b1cf91f7efd99001549d2eafa05c66c1 69918 rio branco AC
94906d652f7d1c9e4c4af604ede7784b 544d0f13064dc42b8f6957cf43fa5cad 69918 rio branco AC
5c495510e9f861b83aa5fb7c308e8d76 28989ef45087c96e5a4346e88216c2ba 69912 rio branco AC

This table contains 5 columns. customer_id and customer_unique_id can be regarded as the categorical variables of interest since they identify individual customers. As there are two types of id for customers, it can be suspected that one of them contains duplicate values and should be used with caution; this hypothesis can be tested easily by counting the number of distinct values. The other variables can be used to aggregate customer data; the question is which one to choose. The answer depends on the nature of these variables. I would like to use city or state to aggregate customer data since they are more intuitive than zip_code_prefix. Stakeholders usually cannot comprehend zip codes without some extra effort, while city and state names or abbreviations make more sense.

To choose between city and state, we can count the number of states and cities contained in this table. The intuition is simple: if there are too many cities or states, the customer data is spread too widely to visualize and comprehend, so it is very hard to gain any insight from such an aggregation. A manageable number of categories is important for a meaningful presentation. Hence I tend to ask questions such as: what percentage of customers do, for example, five percent of the cities represent? If one percent of the cities represent eighty percent of the customer base, and there are 4,000 cities, this insight is really useful for reducing the number of categories to visualize. Thanks to the Pareto principle, it is usually the case that a minority of categories holds the majority of the "wealth". Hence it is very likely that only a few percent of the categories will contain a large proportion of the data. A quick way to check this is sketched below.
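A minimal sketch of this Pareto check, reusing pandas and the engine from the setup cell and assuming the olist schema: compute the cumulative share of the customer base covered by the top cities.

# Sketch: cumulative share of the customer base covered by the top cities,
# assuming the SQLAlchemy `engine` from the setup cell and the 'olist' schema.
city_counts = pd.read_sql(
    "SELECT city, COUNT(customer_unique_id) AS no_customers "
    "FROM olist.customers GROUP BY city ORDER BY no_customers DESC;",
    engine)

city_counts['cum_share_pct'] = (city_counts['no_customers'].cumsum()
                                / city_counts['no_customers'].sum() * 100)

# Share of the customer base covered by the top 10 cities
print(city_counts['cum_share_pct'].iloc[9])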

With these thoughts in mind, the possible questions to ask for this table might be as follows.

Questions:

  • How many customers are there?
  • How many distinct cities, states are there?
  • Which are the top ten cities that have the most customers?
  • How many cities have more than 500 customers?
  • Which are the top ten states that have the most customers?
  • What percentage of the customer base do the top ten cities account for?
  • What percentage of the customer base do the top ten states account for?

Counting the number of customers, cities, and states¶

Firstly, let us count the number of customers, cities, and states

In [5]:
%%sql
SELECT 
    COUNT(*) AS no_rows,
    COUNT(DISTINCT customer_id) AS no_customer_id,
    COUNT(DISTINCT customer_unique_id) AS no_unique_customer_id,
    COUNT(DISTINCT city) AS no_distinct_city,
    COUNT(DISTINCT state) AS no_distinct_state
FROM customers;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[5]:
no_rows no_customer_id no_unique_customer_id no_distinct_city no_distinct_state
99441 99441 96096 4119 27

Observations:

  • no_customer_id = 99,441 > 96,096 = no_unique_customer_id, so some customers have more than one customer_id
  • The number of distinct cities is much higher than the number of distinct states, so it might be better to aggregate the data by state rather than by city

Top ten cities having the most customers¶

In [6]:
%%sql
SELECT 
    city,
    COUNT(customer_unique_id) AS no_customers
FROM customers
GROUP BY city
ORDER BY no_customers DESC
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[6]:
city no_customers
sao paulo 15540
rio de janeiro 6882
belo horizonte 2773
brasilia 2131
curitiba 1521
campinas 1444
porto alegre 1379
salvador 1245
guarulhos 1189
sao bernardo do campo 938
In [7]:
# Extracting data from SQL query
df_cus_city = %sql SELECT city, COUNT(customer_unique_id) AS no_customers FROM customers GROUP BY city ORDER BY no_customers DESC LIMIT 10;
df_cus_city = df_cus_city.DataFrame().sort_values(by = ['no_customers'], ascending = True)

# generating horizontal bar chart
fig = px.bar(df_cus_city, 
             x = 'no_customers',
             y = 'city',
             orientation = 'h',
             title = 'Olist customer counts of the top ten cities in Brazil (2016 - 2018)',
             labels = {'city': 'City name', 'no_customers': 'Customer count'},
             template = 'simple_white',
             height = 700,
             width = 700,
             text_auto = True)
fig.show()
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.

Observations:

  • Cities in the top ten have at least 938 customers

Cities having more than 500 customers¶

In [8]:
%%sql
WITH cte1 AS( 
    SELECT 
        city,
        COUNT(customer_unique_id) AS no_customers
    FROM customers 
    GROUP BY city 
    HAVING COUNT(customer_unique_id) > 500
    ORDER BY no_customers DESC
) 
SELECT COUNT(*) FROM cte1;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[8]:
count
22

Observations:

  • 22 cities have more than 500 customers

Ten states having the most customers¶

In [9]:
%%sql
SELECT 
    state,
    COUNT(customer_unique_id) AS no_customers
FROM customers
GROUP BY state 
ORDER BY no_customers DESC
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[9]:
state no_customers
SP 41746
RJ 12852
MG 11635
RS 5466
PR 5045
SC 3637
BA 3380
DF 2140
ES 2033
GO 2020

Observations:

  • Any state in the top ten states having the most customers has at least 2,020 customers

Customer shares of the top ten cities¶

In [10]:
%%sql
WITH cte AS (
    SELECT
        city,
        COUNT(customer_unique_id) AS no_customers
    FROM customers
    GROUP BY city
    ORDER BY no_customers DESC
)
SELECT 
    city,
    no_customers,
    ROUND(no_customers / SUM(no_customers) OVER() * 100, 2) AS percentage_customer_base,
    ROUND(SUM(no_customers) OVER(ORDER BY no_customers DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / SUM(no_customers) OVER() * 100, 2) AS running_total_percentage
FROM cte
ORDER BY no_customers DESC
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[10]:
city no_customers percentage_customer_base running_total_percentage
sao paulo 15540 15.63 15.63
rio de janeiro 6882 6.92 22.55
belo horizonte 2773 2.79 25.34
brasilia 2131 2.14 27.48
curitiba 1521 1.53 29.01
campinas 1444 1.45 30.46
porto alegre 1379 1.39 31.85
salvador 1245 1.25 33.10
guarulhos 1189 1.20 34.30
sao bernardo do campo 938 0.94 35.24

(This table will not be visualized; with more than four thousand cities, a donut chart would not be very useful!)

Observations:

  • The top ten cities account for only about 35 percent of the customer base

Customer shares of the top ten states¶

In [11]:
%%sql
WITH cte AS (
    SELECT
        state,
        COUNT(customer_unique_id) AS no_customers
    FROM customers
    GROUP BY state
    ORDER BY no_customers DESC
)
SELECT 
    state,
    no_customers,
    ROUND(no_customers / SUM(no_customers) OVER() * 100, 2) AS percentage_customer_base,
    ROUND(SUM(no_customers) OVER(ORDER BY no_customers DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / SUM(no_customers) OVER() * 100, 2) AS running_total_percentage
FROM cte
ORDER BY no_customers DESC
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[11]:
state no_customers percentage_customer_base running_total_percentage
SP 41746 41.98 41.98
RJ 12852 12.92 54.90
MG 11635 11.70 66.61
RS 5466 5.50 72.10
PR 5045 5.07 77.18
SC 3637 3.66 80.83
BA 3380 3.40 84.23
DF 2140 2.15 86.38
ES 2033 2.04 88.43
GO 2020 2.03 90.46
In [12]:
# Extracting data from sql query
df_state_shares = %sql SELECT state, COUNT(customer_unique_id) AS no_customers FROM customers GROUP BY state;
df_state_shares = df_state_shares.DataFrame().sort_values(by = ['no_customers'], ascending = False)

# Use `hole` to create a donut-like pie chart
fig = go.Figure(data = [go.Pie(labels = df_state_shares.state, 
                               values = df_state_shares.no_customers, 
                               hole = .5, 
                               pull = [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])])
fig.update_layout(
    title = 'Olist customer shares of all Brazilian states (2016-2018)',
    autosize = False,
    width = 600,
    height = 500)
fig.update_traces(textposition = 'inside')
fig.update_layout(uniformtext_minsize = 12, uniformtext_mode = 'hide')

fig.show()
 * postgresql://postgres:***@localhost:5432/olist
27 rows affected.

Shares of the top ten states are pulled out of the donut to highlight how many customers they account for in the overall pool. As can be seen from the visualization, these shares make up almost the whole chart.

Observations:

  • The top ten states account for over 90 percent of the customer base

Conclusions¶

The analyses confirm my initial hypotheses about the data. customer_unique_id should be used to count customers, and state is the better aggregating variable: the top ten states contain more than ninety percent of the customer base, and the top two states (SP and RJ) account for more than fifty percent of it. With these insights, we only need to visualize less than half of the states to show more than ninety percent of the information, which makes a huge difference.

The geographical distribution of customers by state can be visualized easily with a choropleth map as follows.

In [13]:
# Extracting data from sql query
df_cus_loc = %sql SELECT state, COUNT(customer_unique_id) AS no_customers FROM customers GROUP BY state

# Extracting full state names for state abbreviations from geojson set
state_id_map = {}
for feature in Brazil['features']:
    feature['id'] = feature['properties']['name']
    state_id_map[feature['properties']['sigla']] = feature['id']

# Replacing state abbreviations with full state names
df_cus_loc = df_cus_loc.DataFrame().replace(state_id_map)

# plotting choropleth map
fig = px.choropleth(
    df_cus_loc, 
    locations = 'state', 
    geojson = Brazil, 
    color = 'no_customers', 
    title = 'Geographical distribution of customers',
    labels = {'no_customers': 'Number of customers', 'state': 'State'},
    height = 600,
    width = 1200
)
fig.update_geos(fitbounds = 'locations', visible = False)
fig.show()
 * postgresql://postgres:***@localhost:5432/olist
27 rows affected.

As can be seen from the map, the majority of customers is concentrated in cities in the South and South-East of Brazil, and the number of customers drops dramatically towards the North and North-West.

Explore the table sellers¶

Firstly, let us look at some rows of the table.

In [14]:
%%sql
SELECT * FROM sellers ORDER BY state LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[14]:
seller_id zip_code_prefix city state
4be2e7f96b4fd749d52dff41f80e39dd 69900 rio branco AC
327b89b872c14d1c0be7235ef4871685 69005 manaus AM
53243585a1d6dc2643021fd1853d8905 42738 lauro de freitas BA
d2e753bb80b7d4faa77483ed00edc8ca 45810 porto seguro BA
b00af24704019bd2e1b335e70ad11f7c 40130 salvador BA
4aba391bc3b88717ce08eb11e44937b2 45816 arraial d'ajuda (porto seguro) BA
1444c08e64d55fb3c25f0f09c07ffcf2 42738 lauro de freitas BA
fc59392d66ef99377e50356ee4f3b4e1 40243 salvador BA
43753b27d77860f1654aa72e251a7878 44895 barro alto BA
4221a7df464f1fe2955934e30ff3a5a1 48602 bahia BA

For this table, the thought process is similar to the previous one. However, this table does not contain a column for a unique seller id, so the list of questions is slightly different.

Questions:

  • How many sellers are there?
  • How many distinct cities, states are there?
  • Which are the top ten cities that have the most sellers?
  • Which are the top ten states that have the most sellers?
  • What percentage of the seller base do the top ten cities account for?
  • What percentage of the seller base do the top ten states account for?

Counting the number of sellers, cities, and states¶

In [15]:
%%sql
SELECT 
    COUNT(*) AS no_rows,
    COUNT(DISTINCT seller_id) AS no_seller_id,
    COUNT(DISTINCT city) AS no_distinct_city,
    COUNT(DISTINCT state) AS no_distinct_state
FROM sellers;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[15]:
no_rows no_seller_id no_distinct_city no_distinct_state
3095 3095 611 23

Observations:

  • 3,095 sellers, 611 cities, and 23 states

Top ten cities having the most sellers¶

In [16]:
%%sql
SELECT 
    city,
    COUNT(seller_id) AS no_sellers
FROM sellers
GROUP BY city
ORDER BY no_sellers DESC
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[16]:
city no_sellers
sao paulo 694
curitiba 127
rio de janeiro 96
belo horizonte 68
ribeirao preto 52
guarulhos 50
ibitinga 49
santo andre 45
campinas 41
maringa 40
In [17]:
# Extracting data from SQL query
df_sel_city = %sql SELECT city, COUNT(seller_id) AS no_sellers FROM sellers GROUP BY city ORDER BY no_sellers DESC LIMIT 10;
df_sel_city = df_sel_city.DataFrame().sort_values(by = ['no_sellers'], ascending = True)

# generating horizontal bar chart
fig = px.bar(df_sel_city, 
             x = 'no_sellers',
             y = 'city',
             orientation = 'h',
             title = 'Olist seller counts of the top ten cities in Brazil (2016 - 2018)',
             labels = {'city': 'City name', 'no_sellers': 'Seller count'},
             template = 'simple_white',
             height = 700,
             width = 700,
             text_auto = True)
fig.show()
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.

Observations:

  • Cities in the top ten have at least 40 sellers

Top ten states having the most sellers¶

In [18]:
%%sql
SELECT 
    state,
    COUNT(seller_id) AS no_sellers
FROM sellers
GROUP BY state
ORDER BY no_sellers DESC
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[18]:
state no_sellers
SP 1849
PR 349
MG 244
SC 190
RJ 171
RS 129
GO 40
DF 30
ES 23
BA 19

Observations:

  • states in the top ten have at least 19 sellers

Seller shares of the top ten cities¶

In [19]:
%%sql
WITH cte AS (
    SELECT
        city,
        COUNT(seller_id) AS no_sellers
    FROM sellers
    GROUP BY city
    ORDER BY no_sellers DESC
)
SELECT 
    city,
    no_sellers,
    ROUND(no_sellers / SUM(no_sellers) OVER() * 100, 2) AS percentage_seller_base,
    ROUND(SUM(no_sellers) OVER(ORDER BY no_sellers DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / SUM(no_sellers) OVER() * 100, 2) AS running_total_percentage
FROM cte
ORDER BY no_sellers DESC
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[19]:
city no_sellers percentage_seller_base running_total_percentage
sao paulo 694 22.42 22.42
curitiba 127 4.10 26.53
rio de janeiro 96 3.10 29.63
belo horizonte 68 2.20 31.83
ribeirao preto 52 1.68 33.51
guarulhos 50 1.62 35.12
ibitinga 49 1.58 36.70
santo andre 45 1.45 38.16
campinas 41 1.32 39.48
maringa 40 1.29 40.78

Observations:

  • The top ten cities account for approximately 41 percent of the seller base

Seller shares of the top ten states¶

In [20]:
%%sql
WITH cte AS (
    SELECT
        state,
        COUNT(seller_id) AS no_sellers
    FROM sellers
    GROUP BY state
    ORDER BY no_sellers DESC
)
SELECT 
    state,
    no_sellers,
    ROUND(no_sellers / SUM(no_sellers) OVER() * 100, 2) AS percentage_seller_base,
    ROUND(SUM(no_sellers) OVER(ORDER BY no_sellers DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / SUM(no_sellers) OVER() * 100, 2) AS running_total_percentage
FROM cte
ORDER BY no_sellers DESC
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[20]:
state no_sellers percentage_seller_base running_total_percentage
SP 1849 59.74 59.74
PR 349 11.28 71.02
MG 244 7.88 78.90
SC 190 6.14 85.04
RJ 171 5.53 90.57
RS 129 4.17 94.73
GO 40 1.29 96.03
DF 30 0.97 97.00
ES 23 0.74 97.74
BA 19 0.61 98.35
In [21]:
# Extracting data from sql query
df_state_shares = %sql SELECT state, COUNT(seller_id) AS no_sellers FROM sellers GROUP BY state;
df_state_shares = df_state_shares.DataFrame().sort_values(by = ['no_sellers'], ascending = False)

# Use `hole` to create a donut-like pie chart
fig = go.Figure(data = [go.Pie(labels = df_state_shares.state, 
                               values = df_state_shares.no_sellers, 
                               hole = .5, 
                               pull = [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])])
fig.update_layout(
    title = 'Olist seller shares of all Brazilian states (2016-2018)',
    autosize = False,
    width = 600,
    height = 500)
fig.update_traces(textposition = 'inside')
fig.update_layout(uniformtext_minsize = 12, uniformtext_mode = 'hide')

fig.show()
 * postgresql://postgres:***@localhost:5432/olist
23 rows affected.

Observations:

  • The top ten states account for more than 98 percent of the seller base

Conclusions¶

The same conclusion can be made for the table sellers. It should be noted that there are more than ninety-six thousand distinct customers but only about three thousand sellers, so each seller serves approximately thirty-one customers on average (96,096 / 3,095 ≈ 31). Moreover, the top ten states account for more than 98 percent of the sellers, so state is a much better variable to aggregate the data by, both for customers and for sellers.

Similarly, the geographical distribution of the sellers can be presented with a choropleth map as follows.

In [22]:
# Extracting data from sql query
df_sel_loc = %sql SELECT state, COUNT(seller_id) AS no_sellers FROM sellers GROUP BY state;

# Replacing state abbreviations with full state names
df_sel_loc = df_sel_loc.DataFrame().replace(state_id_map)

# plotting choropleth map
fig = px.choropleth(
    df_sel_loc, 
    locations = 'state', 
    geojson = Brazil, 
    color = 'no_sellers', 
    title = 'Geographical distribution of sellers',
    labels = {'no_sellers': 'Number of sellers', 'state': 'State'},
    height = 600,
    width = 1200
)
fig.update_geos(fitbounds = 'locations', visible = False)
fig.show()
 * postgresql://postgres:***@localhost:5432/olist
23 rows affected.

Explore the table products¶

First ten rows of the data

In [23]:
%%sql
SELECT * FROM products LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[23]:
product_id product_category name_length description_length photos_quantity weight_g length_cm height_cm width_cm
1e9e8ef04dbcff4541ed26657ea517e5 perfumaria 40 287 1 225 16 10 14
3aa071139cb16b67ca9e5dea641aaa2f artes 44 276 1 1000 30 18 20
96bd76ec8810374ed1b65e291975717f esporte_lazer 46 250 1 154 18 9 15
cef67bcfe19066a932b7673e239eb23d bebes 27 261 1 371 26 4 26
9dc1a7de274444849c219cff195d0b71 utilidades_domesticas 37 402 4 625 20 17 13
41d3672d4792049fa1779bb35283ed13 instrumentos_musicais 60 745 1 200 38 5 11
732bd381ad09e530fe0a5f457d81becb cool_stuff 56 1272 4 18350 70 24 44
2548af3e6e77a690cf3eb6368e9ab61e moveis_decoracao 56 184 2 900 40 8 40
37cc742be07708b53a98702e77a21a02 eletrodomesticos 57 163 1 400 27 13 17
8c92109888e8cdf9d66dc7e463025574 brinquedos 36 1156 1 600 17 10 12

This table contains many numeric variables and two categorical variables. Hence, it is suitable for calculating some descriptive statistics and constructing a summary table for them in SQL. Usually, such a table includes

  • Observation count
  • Mean
  • Standard deviation
  • Min
  • Max
  • Mode
  • Percentiles: 1%, 25%, 50% (median), 75%, and 99%

It should be noted that the data in this table describes the products sold on the platform, so it is purely descriptive. It is still useful for knowing how products are distributed across categories. However, this information alone is not enough to determine which categories are important, because the number of products is not the relevant criterion; category sales would be a better metric for that. Hence, the questions for this table focus mostly on describing the data statistically.
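Before building the SQL summary, the same statistics can be cross-checked quickly in pandas. This is only a sketch, reusing pandas and the engine from the setup cell and assuming the olist schema.

# Sketch: cross-check of the SQL summary statistics with pandas describe(),
# assuming the SQLAlchemy `engine` from the setup cell and the 'olist' schema.
products_num = pd.read_sql(
    "SELECT name_length, description_length, photos_quantity, "
    "weight_g, length_cm, height_cm, width_cm FROM olist.products;",
    engine)

# Percentiles matching the SQL query (1%, 25%, 50%, 75%, 99%).
# Note: 'count' here is the non-null count, whereas n in the SQL summary
# below counts distinct values.
summary = products_num.describe(percentiles = [0.01, 0.25, 0.5, 0.75, 0.99]).T
summary['mode'] = products_num.mode().iloc[0]
print(summary)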

Questions:

  • How many products are there?
  • How many values of each column are missing?
  • What are the top ten categories with the highest product counts?
  • What are the descriptive statistics for name_length?
  • What are the descriptive statistics for description_length?
  • What are the descriptive statistics for photos_quantity?
  • What are the descriptive statistics for weight_g?
  • What are the descriptive statistics for length_cm?
  • What are the descriptive statistics for height_cm?
  • What are the descriptive statistics for width_cm?

Product and category counts¶

In [24]:
%%sql
SELECT 
    COUNT(*) AS no_rows,
    COUNT(DISTINCT product_id) AS no_products,
    COUNT(DISTINCT product_category) AS no_categories,
    COUNT(*) - COUNT(product_category) AS no_missing_category,
    COUNT(*) - COUNT(name_length) AS no_missing_name_lgth,
    COUNT(*) - COUNT(description_length) AS no_missing_description_lgth,
    COUNT(*) - COUNT(photos_quantity) AS no_missing_photos_qty,
    COUNT(*) - COUNT(weight_g) AS no_missing_weight,
    COUNT(*) - COUNT(length_cm) AS no_missing_length,
    COUNT(*) - COUNT(height_cm) AS no_misisng_height,
    COUNT(*) - COUNT(width_cm) AS no_missing_width
FROM products;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[24]:
no_rows no_products no_categories no_missing_category no_missing_name_lgth no_missing_description_lgth no_missing_photos_qty no_missing_weight no_missing_length no_misisng_height no_missing_width
32951 32951 73 610 610 610 610 2 2 2 2

Observations:

  • 32,951 products across 73 categories. Some products are missing their category, but their physical properties are available, so they might belong to a category that is not yet in the provided list; they can then be grouped as 'others' (see the sketch below)
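A minimal sketch of that grouping, labelling products with no category as 'others'; it reuses pandas and the engine from the setup cell and assumes the olist schema.

# Sketch: group products with a missing category under 'others', assuming
# the SQLAlchemy `engine` from the setup cell and the 'olist' schema.
cat_counts = pd.read_sql(
    "SELECT COALESCE(product_category, 'others') AS category, "
    "COUNT(*) AS no_products "
    "FROM olist.products GROUP BY category ORDER BY no_products DESC;",
    engine)
print(cat_counts.head(15))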

Top ten categories having highest product counts¶

In [25]:
%%sql
SELECT 
    product_category_eng,
    COUNT(product_id) AS no_products
FROM products LEFT JOIN category_names_english USING(product_category)
GROUP BY product_category_eng 
ORDER BY no_products DESC
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[25]:
product_category_eng no_products
bed_bath_table 3029
sports_leisure 2867
furniture_decor 2657
health_beauty 2444
housewares 2335
auto 1900
computers_accessories 1639
toys 1411
watches_gifts 1329
telephony 1134

Observations:

  • It might be worth looking at these categories. Focusing on important categories is better than looking at all of them, as there are 73 categories in total, so lowering the number of categories is necessary for meaningful data visualization
  • However, the number of products is not the right criterion to determine which categories are important; sales or revenue is more relevant

Descriptive statistics¶

In [26]:
%%sql
SELECT
    1 AS variable_no,
    'name_length' AS variable,
    COUNT(DISTINCT name_length) AS n,
    ROUND(AVG(name_length), 2) AS mean,
    ROUND(STDDEV_SAMP(name_length), 2) AS sample_stddev,
    MAX(name_length) AS max,
    MIN(name_length) AS min,
    MODE() WITHIN GROUP (ORDER BY name_length) AS mode,
    PERCENTILE_DISC(0.01) WITHIN GROUP (ORDER BY name_length) AS "01_percentile",
    PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY name_length) AS "25_percentile",
    PERCENTILE_DISC(0.50) WITHIN GROUP (ORDER BY name_length) AS median,
    PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY name_length) AS "75_percentile",
    PERCENTILE_DISC(0.99) WITHIN GROUP (ORDER BY name_length) AS "99_percentile"
FROM products
WHERE name_length IS NOT NULL
UNION
SELECT 
    2 AS variable_no,
    'description_length' AS variable,
    COUNT(DISTINCT description_length) AS n,
    ROUND(AVG(description_length), 2) AS mean,
    ROUND(STDDEV_SAMP(description_length), 2) AS sample_stddev,
    MAX(description_length) AS max,
    MIN(description_length) AS min,
    MODE() WITHIN GROUP (ORDER BY description_length) AS mode,
    PERCENTILE_DISC(0.01) WITHIN GROUP (ORDER BY description_length) AS "01_percentile",
    PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY description_length) AS "25_percentile",
    PERCENTILE_DISC(0.50) WITHIN GROUP (ORDER BY description_length) AS median,
    PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY description_length) AS "75_percentile",
    PERCENTILE_DISC(0.99) WITHIN GROUP (ORDER BY description_length) AS "99_percentile"
FROM products
WHERE description_length IS NOT NULL
UNION
SELECT 
    3 AS variable_no,
    'photos_quantity' AS variable,
    COUNT(DISTINCT photos_quantity) AS n,
    ROUND(AVG(photos_quantity), 2) AS mean,
    ROUND(STDDEV_SAMP(photos_quantity), 2) AS sample_stddev,
    MAX(photos_quantity) AS max,
    MIN(photos_quantity) AS min,
    MODE() WITHIN GROUP (ORDER BY photos_quantity) AS mode,
    PERCENTILE_DISC(0.01) WITHIN GROUP (ORDER BY photos_quantity) AS "01_percentile",
    PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY photos_quantity) AS "25_percentile",
    PERCENTILE_DISC(0.50) WITHIN GROUP (ORDER BY photos_quantity) AS median,
    PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY photos_quantity) AS "75_percentile",
    PERCENTILE_DISC(0.99) WITHIN GROUP (ORDER BY photos_quantity) AS "99_percentile"
FROM products
WHERE photos_quantity IS NOT NULL
UNION
SELECT 
    4 AS variable_no,
    'weight_g' AS variable,
    COUNT(DISTINCT weight_g) AS n,
    ROUND(AVG(weight_g), 2) AS mean,
    ROUND(STDDEV_SAMP(weight_g), 2) AS sample_stddev,
    MAX(weight_g) AS max,
    MIN(weight_g) AS min,
    MODE() WITHIN GROUP (ORDER BY weight_g) AS mode,
    PERCENTILE_DISC(0.01) WITHIN GROUP (ORDER BY weight_g) AS "01_percentile",
    PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY weight_g) AS "25_percentile",
    PERCENTILE_DISC(0.50) WITHIN GROUP (ORDER BY weight_g) AS median,
    PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY weight_g) AS "75_percentile",
    PERCENTILE_DISC(0.99) WITHIN GROUP (ORDER BY weight_g) AS "99_percentile"
FROM products
WHERE weight_g IS NOT NULL
UNION
SELECT 
    5 AS variable_no,
    'length_cm' AS variable,
    COUNT(DISTINCT length_cm) AS n,
    ROUND(AVG(length_cm), 2) AS mean,
    ROUND(STDDEV_SAMP(length_cm), 2) AS sample_stddev,
    MAX(length_cm) AS max,
    MIN(length_cm) AS min,
    MODE() WITHIN GROUP (ORDER BY length_cm) AS mode,
    PERCENTILE_DISC(0.01) WITHIN GROUP (ORDER BY length_cm) AS "01_percentile",
    PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY length_cm) AS "25_percentile",
    PERCENTILE_DISC(0.50) WITHIN GROUP (ORDER BY length_cm) AS median,
    PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY length_cm) AS "75_percentile",
    PERCENTILE_DISC(0.99) WITHIN GROUP (ORDER BY length_cm) AS "99_percentile"
FROM products
WHERE length_cm IS NOT NULL
UNION
SELECT 
    6 AS variable_no,
    'height_cm' AS variable,
    COUNT(DISTINCT height_cm) AS n,
    ROUND(AVG(height_cm), 2) AS mean,
    ROUND(STDDEV_SAMP(height_cm), 2) AS sample_stddev,
    MAX(height_cm) AS max,
    MIN(height_cm) AS min,
    MODE() WITHIN GROUP (ORDER BY height_cm) AS mode,
    PERCENTILE_DISC(0.01) WITHIN GROUP (ORDER BY height_cm) AS "01_percentile",
    PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY height_cm) AS "25_percentile",
    PERCENTILE_DISC(0.50) WITHIN GROUP (ORDER BY height_cm) AS median,
    PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY height_cm) AS "75_percentile",
    PERCENTILE_DISC(0.99) WITHIN GROUP (ORDER BY height_cm) AS "99_percentile"
FROM products
WHERE height_cm IS NOT NULL
UNION
SELECT 
    7 AS variable_no,
    'width_cm' AS variable,
    COUNT(DISTINCT width_cm) AS n,
    ROUND(AVG(width_cm), 2) AS mean,
    ROUND(STDDEV_SAMP(width_cm), 2) AS sample_stddev,
    MAX(width_cm) AS max,
    MIN(width_cm) AS min,
    MODE() WITHIN GROUP (ORDER BY width_cm) AS mode,
    PERCENTILE_DISC(0.01) WITHIN GROUP (ORDER BY width_cm) AS "01_percentile",
    PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY width_cm) AS "25_percentile",
    PERCENTILE_DISC(0.50) WITHIN GROUP (ORDER BY width_cm) AS median,
    PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY width_cm) AS "75_percentile",
    PERCENTILE_DISC(0.99) WITHIN GROUP (ORDER BY width_cm) AS "99_percentile"
FROM products
WHERE width_cm IS NOT NULL
ORDER BY variable_no;
 * postgresql://postgres:***@localhost:5432/olist
7 rows affected.
Out[26]:
variable_no variable n mean sample_stddev max min mode 01_percentile 25_percentile median 75_percentile 99_percentile
1 name_length 66 48.48 10.25 76 5 60 20 42 51 57 63
2 description_length 2960 771.50 635.12 3992 4 404 84 339 595 972 3287
3 photos_quantity 19 2.19 1.74 20 1 1 1 1 1 3 8
4 weight_g 2204 2276.47 4282.04 40425 0 200 60 300 700 1900 22550
5 length_cm 99 30.82 16.91 105 7 16 16 18 25 38 100
6 height_cm 102 16.94 13.64 105 2 10 2 8 13 21 69
7 width_cm 95 23.20 12.08 118 6 11 11 15 20 30 63

These summary statistics can be visualized with box plots.

In [27]:
# Extracting data from SQl query
df_product_property = %sql SELECT name_length, photos_quantity, length_cm, height_cm, width_cm FROM products;
df_product_property = df_product_property.DataFrame()

# Constructing box plots
# Simple box plots
fig = px.box(df_product_property,
             title = 'Box plots of product properties listed on Olist  (Jul 2016 - Jun 2018)',
             width = 1000,
             height = 600)

# Updating layout to reduce clutter
fig.update_layout(autosize = True, plot_bgcolor = 'white')

# Showing the visualization
fig.show()
 * postgresql://postgres:***@localhost:5432/olist
32951 rows affected.

The box plots of description_length and weight_g have been excluded because their much larger scales would make the other plots hard to read.
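If description_length and weight_g are still of interest, one option (a sketch only, not used in the analysis above) is to put the y-axis on a log scale so that variables with very different ranges can share one chart. It reuses plotly express and the engine from the setup cell and assumes the olist schema.

# Sketch: box plots of all numeric product properties on a log-scaled
# y-axis, assuming the `engine` from the setup cell and the 'olist' schema.
df_all_props = pd.read_sql(
    "SELECT name_length, description_length, photos_quantity, "
    "weight_g, length_cm, height_cm, width_cm FROM olist.products;",
    engine)

fig = px.box(df_all_props,
             log_y = True,   # note: zero weights will not appear on a log axis
             title = 'Box plots of product properties on Olist (log scale)',
             template = 'simple_white',
             width = 1000,
             height = 600)
fig.show()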

Conclusions¶

As expected, the data in this table is purely descriptive, and extra data is necessary to extract useful insights. There are many product categories, too many for a visualization of all of them to be useful, and it is not possible to determine the important categories with only the data available in this table.

Explore the table orders¶

Similarly, let us have a look at the data first.

In [28]:
%%sql
SELECT * FROM orders LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[28]:
order_id customer_id status purchase_timestamp approval_timestamp delivered_carrier_date delivered_customer_date estimated_delivery_date
e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 00:00:00
53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13 00:00:00
47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04 00:00:00
949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15 00:00:00
ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26 00:00:00
a4591c265e18cb1dcee52889e2d8acc3 503740e9ca751ccdda7ba28e9ab8f608 delivered 2017-07-09 21:57:05 2017-07-09 22:10:13 2017-07-11 14:58:04 2017-07-26 10:57:55 2017-08-01 00:00:00
136cce7faa42fdb2cefd53fdc79a6098 ed0271e0b7da060a393796590e7b737a invoiced 2017-04-11 12:22:08 2017-04-13 13:25:17 None None 2017-05-09 00:00:00
6514b8ad8028c9f2cc2374ded245783f 9bdf08b4b3b52b5526ff42d37d47f222 delivered 2017-05-16 13:10:30 2017-05-16 13:22:11 2017-05-22 10:07:46 2017-05-26 12:55:51 2017-06-07 00:00:00
76c6e866289321a7c93b82b54852dc33 f54a9f0e6b351c431402b8461ea51999 delivered 2017-01-23 18:29:09 2017-01-25 02:50:47 2017-01-26 14:16:31 2017-02-02 14:08:10 2017-03-06 00:00:00
e69bfb5eb88e0ed6a785585b27e16dbf 31ad1d1b63eb9962463f764d4e6e0c9d delivered 2017-07-29 11:55:02 2017-07-29 12:05:32 2017-08-10 19:45:24 2017-08-16 17:14:30 2017-08-23 00:00:00

This table contains multiple timestamps, which can provide many useful insights if analyzed properly, since time series show how a variable changes over time. Besides the timestamps, the table only contains categorical variables, but the timestamps are related to each other: they record when each step of the order process was completed. Hence, the average processing time of each step, and how it changes over time, can be calculated. Dealing with timestamp data in SQL is quite convenient thanks to its consistent format and the variety of date/time functions that SQL provides. The questions therefore focus on these aspects; a small pandas sketch of the same idea follows.
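The processing-time idea can be sketched in pandas by differencing the timestamp columns. This reuses pandas and the engine from the setup cell and assumes the olist schema.

# Sketch: processing times of each order step as pandas Timedeltas,
# assuming the SQLAlchemy `engine` from the setup cell and the 'olist' schema.
ts_cols = ['purchase_timestamp', 'approval_timestamp',
           'delivered_carrier_date', 'delivered_customer_date']
orders_ts = pd.read_sql(
    "SELECT " + ", ".join(ts_cols) + " FROM olist.orders;",
    engine,
    parse_dates = ts_cols)

approval_time = orders_ts['approval_timestamp'] - orders_ts['purchase_timestamp']
carrier_time  = orders_ts['delivered_carrier_date'] - orders_ts['approval_timestamp']
delivery_time = orders_ts['delivered_customer_date'] - orders_ts['delivered_carrier_date']

# Missing timestamps (e.g. undelivered orders) are skipped by mean()
print(approval_time.mean(), carrier_time.mean(), delivery_time.mean())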

Questions:

  • What is the delivery rate?
  • How does the delivery rate change over time?
  • What is the average approval time?
  • How does the average approval time change over time?
  • What is the average carrier delivery time since approval?
  • How does the average carrier delivered time change over time?
  • What is the average delivery time of the carrier?
  • How does the average delivery time of the carrier change over time?
  • When do customers usually make the purchase during the day?
  • What are the most frequent purchase days of customers during the week?
  • What are the most frequent delivery hours during the day?
  • What are the most frequent delivery days during the week?
  • How accurate is the estimated delivery date?
  • How does the accuracy change over time?

Counting rows and missing values¶

In [29]:
%%sql
SELECT 
    COUNT(*) AS no_rows,
    COUNT(DISTINCT customer_id) AS no_customers,
    COUNT(*) - COUNT(status) AS no_missing_status,
    COUNT(*) - COUNT(purchase_timestamp) AS no_missing_purchase_timestamp,
    COUNT(*) - COUNT(approval_timestamp) AS no_misisng_approval_timestamp,
    COUNT(*) - COUNT(delivered_carrier_date) AS no_missing_delivered_carrier_timestamp,
    COUNT(*) - COUNT(delivered_customer_date) AS no_missing_delivered_customer_timestamp,
    COUNT(*) - COUNT(estimated_delivery_date) AS no_missing_est_deliver_timestamp
FROM orders;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[29]:
no_rows no_customers no_missing_status no_missing_purchase_timestamp no_misisng_approval_timestamp no_missing_delivered_carrier_timestamp no_missing_delivered_customer_timestamp no_missing_est_deliver_timestamp
99441 99441 0 0 160 1783 2965 0

Observations:

  • The data seems to show only one order per customer_id, as the number of rows equals the number of distinct customer_id values
  • customer_unique_id from the customers table is needed to count the number of purchases per customer (see the sketch below)
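A minimal sketch of that follow-up check, joining orders to customers; it reuses pandas and the engine from the setup cell and assumes the olist schema.

# Sketch: number of orders per customer_unique_id, obtained by joining
# orders with customers; assumes the SQLAlchemy `engine` from the setup
# cell and the 'olist' schema.
purchases = pd.read_sql(
    "SELECT c.customer_unique_id, COUNT(o.order_id) AS no_orders "
    "FROM olist.orders o JOIN olist.customers c USING (customer_id) "
    "GROUP BY c.customer_unique_id ORDER BY no_orders DESC;",
    engine)
print(purchases.head())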

Delivery rate¶

In [30]:
%%sql
WITH cte AS ( 
    SELECT 
        status,
        COUNT(status) AS status_count
    FROM orders
    GROUP BY status
)
SELECT 
    status,
    status_count,
    ROUND(status_count / SUM(status_count) OVER() * 100, 2)  AS status_rate
FROM cte
ORDER BY status_count DESC;
 * postgresql://postgres:***@localhost:5432/olist
8 rows affected.
Out[30]:
status status_count status_rate
delivered 96478 97.02
shipped 1107 1.11
canceled 625 0.63
unavailable 609 0.61
invoiced 314 0.32
processing 301 0.30
created 5 0.01
approved 2 0.00

A donut chart of this data is not very useful because the share of delivered is so large that the other categories would barely be visible in the chart.

Observations:

  • The overall successful delivery rate is more than 97 percent

Changes of the delivery rate over time¶

In [31]:
%%sql
CREATE EXTENSION IF NOT EXISTS tablefunc;
SELECT 
    status,
    ROUND("2016 Q3" / SUM("2016 Q3") OVER() * 100, 2) AS Q3_2016,
    ROUND("2016 Q4" / SUM("2016 Q4") OVER() * 100, 2) AS Q4_2016,
    ROUND("2017 Q1" / SUM("2017 Q1") OVER() * 100, 2) AS Q1_2017,
    ROUND("2017 Q2" / SUM("2017 Q2") OVER() * 100, 2) AS Q2_2017,
    ROUND("2017 Q3" / SUM("2017 Q3") OVER() * 100, 2) AS Q3_2017,
    ROUND("2017 Q4" / SUM("2017 Q4") OVER() * 100, 2) AS Q4_2017,
    ROUND("2018 Q1" / SUM("2018 Q1") OVER() * 100, 2) AS Q1_2018,
    ROUND("2018 Q2" / SUM("2018 Q2") OVER() * 100, 2) AS Q2_2018,
    ROUND("2018 Q3" / SUM("2018 Q3") OVER() * 100, 2) AS Q3_2018,
    ROUND("2018 Q4" / SUM("2018 Q4") OVER() * 100, 2) AS Q4_2018
FROM CROSSTAB($$
    SELECT
        status,
        DATE_PART('year', purchase_timestamp) || ' Q' || DATE_PART('quarter', purchase_timestamp) AS quarter_timestamp,
        COUNT(status) AS status_count
    FROM orders
    GROUP BY status, quarter_timestamp 
    ORDER BY 1, 2
$$) AS ct (status TEXT, "2016 Q3" BIGINT, "2016 Q4" BIGINT, 
                        "2017 Q1" BIGINT, "2017 Q2" BIGINT, "2017 Q3" BIGINT, "2017 Q4" BIGINT, 
                        "2018 Q1" BIGINT, "2018 Q2" BIGINT, "2018 Q3" BIGINT, "2018 Q4" BIGINT);
 * postgresql://postgres:***@localhost:5432/olist
Done.
8 rows affected.
Out[31]:
status q3_2016 q4_2016 q1_2017 q2_2017 q3_2017 q4_2017 q1_2018 q2_2018 q3_2018 q4_2018
approved 2.78 0.21 None None None None None None None None
canceled 5.56 5.03 1.01 0.67 0.59 0.42 0.63 0.29 1.10 100.00
created 11.11 0.21 None None None None None None None None
delivered 2.78 55.77 94.55 95.42 95.78 97.63 97.74 98.42 98.06 None
invoiced 50.00 5.45 0.78 0.47 0.50 0.25 0.19 0.18 None None
processing 5.56 13.42 0.86 0.54 0.63 0.25 0.07 0.01 None None
shipped 2.78 1.68 1.57 1.60 1.06 0.92 1.25 0.98 0.85 None
unavailable 19.44 18.24 1.22 1.30 1.44 0.54 0.12 0.13 None None

Observations:

  • Delivery rate increases and remains high over time

Frequent purchase hours during the day¶

In [32]:
%%sql
SELECT 
    DATE_PART('hour', purchase_timestamp) AS purchase_hour,
    COUNT(order_id)
FROM orders
GROUP BY purchase_hour
ORDER BY purchase_hour;
 * postgresql://postgres:***@localhost:5432/olist
24 rows affected.
Out[32]:
purchase_hour count
0.0 2394
1.0 1170
2.0 510
3.0 272
4.0 206
5.0 188
6.0 502
7.0 1231
8.0 2967
9.0 4785
10.0 6177
11.0 6578
12.0 5995
13.0 6518
14.0 6569
15.0 6454
16.0 6675
17.0 6150
18.0 5769
19.0 5982
20.0 6193
21.0 6217
22.0 5816
23.0 4123
In [33]:
# Extract data from SQL
df_daily_purchase = %sql SELECT DATE_PART('hour', purchase_timestamp) AS purchase_hour, COUNT(order_id) FROM orders GROUP BY purchase_hour ORDER BY purchase_hour;
df_daily_purchase = df_daily_purchase.DataFrame()

# Constructing horizontal bar chart
fig = px.bar(df_daily_purchase, 
             x = 'purchase_hour',
             y = 'count',
             title = 'Hourly number of purchases on Olist (2016 - 2018)',
             labels = {'purchase_hour': 'Hour', 'count': 'Number of hourly orders'},
             template = 'simple_white',
             height = 500,
             width = 700,
             text_auto = True)
fig.show()
 * postgresql://postgres:***@localhost:5432/olist
24 rows affected.

Observations:

  • As can be seen from the visualization, the most common purchasing times are roughly 10:00-17:00 and 20:00-21:00

Frequent purchase days during the week¶

In [34]:
%%sql
SELECT 
    CASE WHEN EXTRACT(DOW FROM purchase_timestamp) = 0 THEN '0_Sunday' 
         WHEN EXTRACT(DOW FROM purchase_timestamp) = 1 THEN '1_Monday' 
         WHEN EXTRACT(DOW FROM purchase_timestamp) = 2 THEN '2_Tuesday'
         WHEN EXTRACT(DOW FROM purchase_timestamp) = 3 THEN '3_Wednesday'
         WHEN EXTRACT(DOW FROM purchase_timestamp) = 4 THEN '4_Thursday'
         WHEN EXTRACT(DOW FROM purchase_timestamp) = 5 THEN '5_Friday'
         ELSE '6_Saturday' END AS day_of_week,
    COUNT(order_id) AS order_count
FROM orders
GROUP BY day_of_week
ORDER BY day_of_week;
 * postgresql://postgres:***@localhost:5432/olist
7 rows affected.
Out[34]:
day_of_week order_count
0_Sunday 11960
1_Monday 16196
2_Tuesday 15963
3_Wednesday 15552
4_Thursday 14761
5_Friday 14122
6_Saturday 10887
In [35]:
# Extract data from SQL
df_weekly_purchase = %sql SELECT EXTRACT(DOW FROM purchase_timestamp) AS purchase_day, COUNT(order_id) FROM orders GROUP BY purchase_day ORDER BY purchase_day;
df_weekly_purchase = df_weekly_purchase.DataFrame().replace({0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'})

# Constructing horizontal bar chart
fig = px.bar(df_weekly_purchase, 
             x = 'purchase_day',
             y = 'count',
             title = 'Daily number of purchases on Olist (2016 - 2018)',
             labels = {'purchase_day': 'Day of Week', 'count': 'Number of orders'},
             template = 'simple_white',
             height = 500,
             width = 700,
             text_auto = True)
fig.show()
 * postgresql://postgres:***@localhost:5432/olist
7 rows affected.

Observations:

  • The number of orders peaks on Monday and decreases through the week and reaches its minimum on Saturday

Average approval time¶

In [36]:
%%sql
SELECT 
    AVG(approval_timestamp - purchase_timestamp) AS avg_approval_time
FROM orders;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[36]:
avg_approval_time
10:25:08.739486

Observations:

  • The average approval time is approximately 10 hours 25 minutes after the purchase order is placed

Changes in the average approval time¶

In [37]:
%%sql
SELECT
    DATE_PART('year', purchase_timestamp) || ' Q' || DATE_PART('quarter', purchase_timestamp) AS quarter_timestamp,
    COUNT(order_id) AS no_purchases, 
    AVG(approval_timestamp - purchase_timestamp) AS avg_approval_time
FROM orders
GROUP BY quarter_timestamp 
ORDER BY quarter_timestamp;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[37]:
quarter_timestamp no_purchases avg_approval_time
2016 Q3 4 22 days, 6:44:13
2016 Q4 325 1 day, 12:16:54.169279
2017 Q1 5262 8:19:03.998283
2017 Q2 9349 10:40:32.145521
2017 Q3 12642 10:01:45.155857
2017 Q4 17848 10:31:47.305155
2018 Q1 21208 9:35:19.757736
2018 Q2 19979 10:58:46.115127
2018 Q3 12820 11:01:03.472983
2018 Q4 4 None

Observations:

  • As the number of purchases changes over the quarters, it is hard to compare the average approval times directly
  • For quarters with more than 5,000 observations, the average approval time ranges from approximately 8 hours 20 minutes to about 11 hours

Average carrier delivery time since approval¶

In [38]:
%%sql
SELECT
    AVG(delivered_carrier_date - approval_timestamp) AS average_carrier_delivery_time
FROM orders;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[38]:
average_carrier_delivery_time
2 days, 19:19:15.298851

Observations:

  • The average carrier delivery time since approval is more than 2 days 19 hours

Changes in the average carrier delivery duration¶

In [39]:
%%sql
SELECT
    DATE_PART('year', purchase_timestamp) || ' Q' || DATE_PART('quarter', purchase_timestamp) AS quarter_timestamp,
    COUNT(order_id) AS no_purchases, 
    AVG(delivered_carrier_date - approval_timestamp) AS average_carrier_delivery_time
FROM orders
GROUP BY quarter_timestamp 
ORDER BY quarter_timestamp;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[39]:
quarter_timestamp no_purchases average_carrier_delivery_time
2016 Q3 4 32 days, 2:26:01.500000
2016 Q4 325 13 days, 2:44:21.541219
2017 Q1 5262 3 days, 0:48:09.145506
2017 Q2 9349 2 days, 18:38:43.117911
2017 Q3 12642 2 days, 17:15:49.646044
2017 Q4 17848 3 days, 8:46:49.350054
2018 Q1 21208 3 days, 2:22:36.890171
2018 Q2 19979 2 days, 5:44:24.036689
2018 Q3 12820 2 days, 5:05:26.327469
2018 Q4 4 None

Observations:

  • Similarly, as the number of orders changes over the quarters, the average carrier delivery times might not be directly comparable
  • For quarters with more than 5,000 observations, the average carrier delivery time seems to be decreasing

Average customer delivery time¶

In [40]:
%%sql
SELECT
    AVG(delivered_customer_date - delivered_carrier_date) AS average_customer_delivery_time
FROM orders;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[40]:
average_customer_delivery_time
9 days, 7:55:59.254149

Observations:

  • It takes more than 9 days on average for the carrier to deliver the order to the customer

Changes in the average customer delivery time¶

In [41]:
%%sql
SELECT
    DATE_PART('year', purchase_timestamp) || ' Q' || DATE_PART('quarter', purchase_timestamp) AS quarter_timestamp,
    COUNT(order_id) AS no_purchases, 
    AVG(delivered_customer_date - delivered_carrier_date) AS average_customer_delivery_time
FROM orders
GROUP BY quarter_timestamp 
ORDER BY quarter_timestamp;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[41]:
quarter_timestamp no_purchases average_customer_delivery_time
2016 Q3 4 1 day, 14:35:45
2016 Q4 325 5 days, 19:11:39.952030
2017 Q1 5262 9 days, 15:18:15.572439
2017 Q2 9349 9 days, 6:29:47.466326
2017 Q3 12642 8 days, 9:28:18.384395
2017 Q4 17848 10 days, 13:59:42.295561
2018 Q1 21208 12 days, 5:58:51.780104
2018 Q2 19979 8 days, 1:50:02.440615
2018 Q3 12820 5 days, 16:02:36.837051
2018 Q4 4 None

Observations:

  • For quarters with more than 5,000 observations, the average customer delivery time ranges from under 6 days to over 12 days.

Common delivery time during the day¶

In [42]:
%%sql
SELECT 
    DATE_PART('hour', delivered_customer_date) AS delivery_hour,
    COUNT(order_id)
FROM orders
WHERE delivered_customer_date IS NOT NULL
GROUP BY delivery_hour
ORDER BY delivery_hour;
 * postgresql://postgres:***@localhost:5432/olist
24 rows affected.
Out[42]:
delivery_hour count
0.0 2885
1.0 1515
2.0 649
3.0 260
4.0 187
5.0 198
6.0 269
7.0 396
8.0 779
9.0 1196
10.0 1799
11.0 2579
12.0 3651
13.0 4561
14.0 5646
15.0 6741
16.0 7901
17.0 8775
18.0 9640
19.0 9484
20.0 9157
21.0 7627
22.0 6143
23.0 4438
In [43]:
# Extract data from SQL
df_daily_delivery = %sql SELECT DATE_PART('hour', delivered_customer_date) AS delivery_hour, COUNT(order_id) FROM orders GROUP BY delivery_hour ORDER BY delivery_hour;
df_daily_delivery = df_daily_delivery.DataFrame()

# Constructing horizontal bar chart
fig = px.bar(df_daily_delivery, 
             x = 'delivery_hour',
             y = 'count',
             title = 'Hourly number of deliveries on Olist (2016 - 2018)',
             labels = {'delivery_hour': 'Delivery hour', 'count': 'Number of deliveries'},
             template = 'simple_white',
             height = 500,
             width = 700,
             text_auto = True)
fig.show()
 * postgresql://postgres:***@localhost:5432/olist
25 rows affected.

Observations:

  • The hour-of-day distribution might be misleading: delivery timestamps span the whole day from 0:00 to 23:00, and the recorded time may be reported considerably later than when the actual delivery took place (see the sketch below)
  • The delivery frequency increases from around 9 am, peaks at around 6 pm (18:00), and decreases after that
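
One way to probe this (a sketch only, not executed in this notebook) is to count how many delivery timestamps fall exactly at midnight; a disproportionately large number would suggest that some delivery dates are recorded without a real time of day:

-- Sketch: deliveries whose recorded time-of-day component is exactly 00:00:00.
-- A count much larger than the neighbouring hours would hint at back-filled dates.
SELECT COUNT(*) AS midnight_deliveries
FROM orders
WHERE delivered_customer_date IS NOT NULL
  AND delivered_customer_date::TIME = '00:00:00';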

Common delivery day during the week¶

In [44]:
%%sql
SELECT 
    CASE WHEN EXTRACT(DOW FROM delivered_customer_date) = 0 THEN '0_Sunday' 
         WHEN EXTRACT(DOW FROM delivered_customer_date) = 1 THEN '1_Monday' 
         WHEN EXTRACT(DOW FROM delivered_customer_date) = 2 THEN '2_Tuesday'
         WHEN EXTRACT(DOW FROM delivered_customer_date) = 3 THEN '3_Wednesday'
         WHEN EXTRACT(DOW FROM delivered_customer_date) = 4 THEN '4_Thursday'
         WHEN EXTRACT(DOW FROM delivered_customer_date) = 5 THEN '5_Friday'
         ELSE '6_Saturday' END AS day_of_week,
    COUNT(order_id) AS delivery_count
FROM orders
GROUP BY day_of_week
ORDER BY day_of_week;
 * postgresql://postgres:***@localhost:5432/olist
7 rows affected.
Out[44]:
day_of_week delivery_count
0_Sunday 1207
1_Monday 18892
2_Tuesday 18493
3_Wednesday 18059
4_Thursday 17097
5_Friday 17265
6_Saturday 8428
In [45]:
# Extract data from SQL
df_weekly_delivery = %sql SELECT EXTRACT(DOW FROM delivered_customer_date) AS delivery_day, COUNT(order_id) FROM orders GROUP BY delivery_day ORDER BY delivery_day;
# Drop the row produced by orders without a delivery date, then map day numbers to names
df_weekly_delivery = df_weekly_delivery.DataFrame().dropna(subset = ['delivery_day']).replace({0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'})

# Constructing vertical bar chart
fig = px.bar(df_weekly_delivery, 
             x = 'delivery_day',
             y = 'count',
             title = 'Number of Olist customer deliveries by day of week (2016 - 2018)',
             labels = {'delivery_day': 'Day of week', 'count': 'Number of deliveries'},
             template = 'simple_white',
             height = 500,
             width = 700,
             text_auto = True)
fig.show()
 * postgresql://postgres:***@localhost:5432/olist
8 rows affected.

Observations:

  • Deliveries peak on Monday and gradually decrease through the week; some deliveries are also made on weekends. Note that the Saturday count above is inflated, because orders without a recorded delivery date fall into the ELSE branch of the CASE expression

Accuracy of the estimated delivery date¶

In [46]:
%%sql
SELECT 
    ROUND(SUM(CASE WHEN delivered_customer_date < estimated_delivery_date THEN 1 ELSE 0 END) / COUNT(delivered_customer_date) :: NUMERIC * 100, 2) AS estimate_accuracy
FROM orders;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[46]:
estimate_accuracy
91.89

Observations:

  • The estimated delivery date is met for around 92 percent of delivered orders

Change in the accuracy of the estimated delivery date¶

In [47]:
%%sql
SELECT
    DATE_PART('year', purchase_timestamp) || ' Q' || DATE_PART('quarter', purchase_timestamp) AS quarter_timestamp,
    COUNT(delivered_customer_date) AS no_available_delivery_date, 
    ROUND(SUM(CASE WHEN delivered_customer_date < estimated_delivery_date THEN 1 ELSE 0 END) / COUNT(delivered_customer_date) :: NUMERIC * 100, 2) AS estimate_accuracy
FROM orders
GROUP BY quarter_timestamp 
HAVING COUNT(delivered_customer_date) > 0
ORDER BY quarter_timestamp;
 * postgresql://postgres:***@localhost:5432/olist
9 rows affected.
Out[47]:
quarter_timestamp no_available_delivery_date estimate_accuracy
2016 Q3 1 0.00
2016 Q4 271 98.89
2017 Q1 4949 95.60
2017 Q2 8983 95.21
2017 Q3 12215 96.00
2017 Q4 17279 89.92
2018 Q1 20628 85.41
2018 Q2 19643 94.91
2018 Q3 12507 92.52

Observations:

  • The estimate accuracy is not stable over the period: for quarters with substantial volume it ranges from about 85 percent (2018 Q1) to about 96 percent (2017 Q3)

Conclusions¶

Overall the delivery rate is quite high and does not change much over the period of investigation. The accuracy of the estimated delivery date is also high and not very volatile. However, this is only a preliminary conclusion because the number of orders varies a lot from one period to another; a better practice might be to sample an equal number of orders from each period and compute these statistics on the samples. Overall, the duration of the whole process, from the moment a customer places an order to the moment he or she receives it, is still long and leaves plenty of room for optimization.
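
A sketch of that sampling idea (not executed here): draw the same number of delivered orders from each quarter before averaging, so that quarters with very different volumes become more comparable. The sample size of 300 is arbitrary; quarters with fewer delivered orders than that simply keep everything they have.

-- Sketch: equal-sized random sample of delivered orders per quarter,
-- then the average customer delivery time on the sampled rows only.
WITH ranked AS (
    SELECT
        DATE_PART('year', purchase_timestamp) || ' Q' || DATE_PART('quarter', purchase_timestamp) AS quarter_timestamp,
        delivered_customer_date - delivered_carrier_date AS customer_delivery_time,
        ROW_NUMBER() OVER (
            PARTITION BY DATE_PART('year', purchase_timestamp), DATE_PART('quarter', purchase_timestamp)
            ORDER BY RANDOM()
        ) AS rn
    FROM orders
    WHERE delivered_customer_date IS NOT NULL
      AND delivered_carrier_date IS NOT NULL
)
SELECT
    quarter_timestamp,
    COUNT(*) AS sampled_orders,
    AVG(customer_delivery_time) AS average_customer_delivery_time
FROM ranked
WHERE rn <= 300
GROUP BY quarter_timestamp
ORDER BY quarter_timestamp;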

Explore the table order_items¶

First ten rows of the data

In [48]:
%%sql
SELECT * FROM order_items ORDER BY price DESC LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[48]:
order_id item_id product_id seller_id shipping_limit_date price freight_value
0812eb902a67711a1cb742b3cdaa65ae 1 489ae2aa008f021502940f251d4cce7f e3b4998c7a498169dc7bce44e6bb6277 2017-02-16 20:37:36 6735.00 194.31
fefacc66af859508bf1a7934eab1e97f 1 69c590f7ffc7bf8db97190b6cb6ed62e 80ceebb4ee9b31afb6c6a916a574a1e2 2018-08-02 04:05:13 6729.00 193.21
f5136e38d1a14a4dbd87dff67da82701 1 1bdf5e6731585cf01aa8169c7028d6ad ee27a8f15b1dded4d213a468ba4eb391 2017-06-15 02:45:17 6499.00 227.66
a96610ab360d42a2e5335a3998b4718a 1 a6492cc69376c469ab6f61d8f44de961 59417c56835dd8e2e72f91f809cd4092 2017-04-18 13:25:18 4799.00 151.34
199af31afc78c699f0dbf71fb178d4d4 1 c3ed642d592594bb648ff4a04cee2747 59417c56835dd8e2e72f91f809cd4092 2017-05-09 15:50:15 4690.00 74.34
8dbc85d1447242f3b127dda390d56e19 1 259037a6a41845e455183f89c5035f18 c72de06d72748d1a0dfb2125be43ba63 2018-06-28 12:36:36 4590.00 91.78
426a9742b533fc6fed17d1fd6d143d7e 1 a1beef8f3992dbd4cd8726796aa69c53 512d298ac2a96d1931b6bd30aa21f61d 2018-08-16 14:24:28 4399.87 113.45
68101694e5c5dc7330c91e1bbc36214f 1 6cdf8fc1d741c76586d8b6b15e9eef30 ed4acab38528488b65a9a9c603ff024a 2018-04-05 08:27:27 4099.99 75.27
b239ca7cd485940b31882363b52e6674 1 dd113cb02b2af9c8e5787e8f1f0722f6 821fb029fc6e495ca4f08a35d51e53a5 2018-08-02 08:15:14 4059.00 104.51
86c4eab1571921a6a6e248ed312f5a5a 1 6902c1962dd19d540807d0ab8fade5c6 fa1c13f2614d7b5c4749cbc52fecda94 2017-03-23 20:08:04 3999.90 17.01

This table connects order information with product and seller information. It confirms that an order may contain several products and that each product might be fulfilled by a different seller. Since the number of sellers is large, we can ask what share of the products sold is captured by the top one percent (or five percent) of sellers. With the price information, sales revenue can be calculated and the sellers can be ranked by their revenue.

Questions:

  • What is the product-order ratio?
  • Who are the top 1% sellers in terms of product counts?
  • Who are the top 1% sellers in terms of total product value?
  • What are the descriptive statistics for the price?
  • What are the descriptive statistics for the freight_value?

Product-order ratio¶

In [49]:
%%sql
SELECT 
    COUNT(DISTINCT order_id) AS order_count, 
    COUNT(DISTINCT product_id) AS product_count,
    ROUND(COUNT(DISTINCT order_id) / COUNT(DISTINCT product_id) :: NUMERIC, 2) AS product_order_ratio
FROM order_items;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[49]:
order_count product_count product_order_ratio
98666 32951 2.99

Observations:

  • The product-order ratio is approximately 3, a rough indication that each distinct product appears in about three orders on average (see the sketch below for a more direct measure)
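
A more direct measure (a sketch, not executed here) is to average the number of distinct orders per product; this can differ slightly from the ratio of the two distinct counts above, because a single order may contain several distinct products.

-- Sketch: average (and maximum) number of distinct orders each product appears in.
SELECT
    ROUND(AVG(order_count), 2) AS avg_orders_per_product,
    MAX(order_count) AS max_orders_per_product
FROM (
    SELECT product_id, COUNT(DISTINCT order_id) AS order_count
    FROM order_items
    GROUP BY product_id
) AS per_product;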

Top 1 percent sellers¶

In [50]:
%%sql
SELECT COUNT(DISTINCT seller_id) / 100 AS number_of_sellers_in_top_1_percent
FROM order_items;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[50]:
number_of_sellers_in_top_1_percent
30

The top one percent has 30 sellers

In [51]:
%%sql
WITH cte AS (
    SELECT 
        seller_id,
        COUNT(product_id) AS product_count 
    FROM order_items
    GROUP BY seller_id
    ORDER BY product_count DESC
)
SELECT 
    seller_id,
    product_count,
    ROUND(product_count / 
        SUM(product_count) OVER() * 100, 2) AS percentage_product_count,
    ROUND(SUM(product_count) OVER(ORDER BY product_count DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / 
        SUM(product_count) OVER() * 100, 2) AS accumulated_percentage_product_count
FROM cte
LIMIT 30;
 * postgresql://postgres:***@localhost:5432/olist
30 rows affected.
Out[51]:
seller_id product_count percentage_product_count accumulated_percentage_product_count
6560211a19b47992c3666cc44a7e94c0 2033 1.80 1.80
4a3ca9315b744ce9f8e9374361493884 1987 1.76 3.57
1f50f920176fa81dab994f9023523100 1931 1.71 5.28
cc419e0650a3c5ba77189a1882b7556a 1775 1.58 6.86
da8622b14eb17ae2831f4ac5b9dab84a 1551 1.38 8.24
955fee9216a65b617aa5c0531780ce60 1499 1.33 9.57
1025f0e2d44d7041d6cf58b6550e0bfa 1428 1.27 10.83
7c67e1448b00f6e969d365cea6b010ab 1364 1.21 12.04
ea8482cd71df3c1969d7b9473ff13abc 1203 1.07 13.11
7a67c85e85bb2ce8582c35f2203ad736 1171 1.04 14.15
4869f7a5dfa277a7dca6462dcf3b52b2 1156 1.03 15.18
3d871de0142ce09b7081e2b9d1733cb1 1147 1.02 16.20
8b321bb669392f5163d04c59e235e066 1018 0.90 17.10
cca3071e3e9bb7d12640c9fbe2301306 830 0.74 17.84
620c87c171fb2a6dd6e8bb4dec959fc6 798 0.71 18.55
a1043bafd471dff536d0c462352beb48 770 0.68 19.23
e9779976487b77c6d4ac45f75ec7afe9 750 0.67 19.89
f8db351d8c4c4c22c6835c19a46f01b0 724 0.64 20.54
d2374cbcbb3ca4ab1086534108cc3ab7 631 0.56 21.10
391fc6631aebcf3004804e51b40bcf1e 613 0.54 21.64
fa1c13f2614d7b5c4749cbc52fecda94 586 0.52 22.16
7d13fca15225358621be4086e1eb0964 578 0.51 22.67
128639473a139ac0f3e5f5ade55873a5 560 0.50 23.17
1835b56ce799e6a4dc4eddc053f04066 558 0.50 23.67
46dc3b2cc0980fb8ec44634e21d2718e 542 0.48 24.15
d91fb3b7d041e83b64a00a3edfb37e4f 536 0.48 24.62
1900267e848ceeba8fa32d80c1a5f5a8 529 0.47 25.09
85d9eb9ddc5d00ca9336a2219c97bb13 522 0.46 26.02
53e4c6e0f4312d4d2107a8c9cddf45cd 522 0.46 26.02
4e922959ae960d389249c378d1c939f5 454 0.40 26.42
In [52]:
%%sql
WITH cte AS (
    SELECT 
        seller_id,
        SUM(price) AS total_sales_value
    FROM order_items
    GROUP BY seller_id
    ORDER BY total_sales_value DESC
)
SELECT 
    seller_id,
    total_sales_value,
    ROUND(total_sales_value / 
        SUM(total_sales_value) OVER() * 100, 2) AS revenue_share,
    ROUND(SUM(total_sales_value) OVER(ORDER BY total_sales_value DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / 
        SUM(total_sales_value) OVER() * 100, 2) AS accumulated_revenue_share
FROM cte
LIMIT 30;
 * postgresql://postgres:***@localhost:5432/olist
30 rows affected.
Out[52]:
seller_id total_sales_value revenue_share accumulated_revenue_share
4869f7a5dfa277a7dca6462dcf3b52b2 229472.63 1.69 1.69
53243585a1d6dc2643021fd1853d8905 222776.05 1.64 3.33
4a3ca9315b744ce9f8e9374361493884 200472.92 1.47 4.80
fa1c13f2614d7b5c4749cbc52fecda94 194042.03 1.43 6.23
7c67e1448b00f6e969d365cea6b010ab 187923.89 1.38 7.61
7e93a43ef30c4f03f38b393420bc753a 176431.87 1.30 8.91
da8622b14eb17ae2831f4ac5b9dab84a 160236.57 1.18 10.09
7a67c85e85bb2ce8582c35f2203ad736 141745.53 1.04 11.13
1025f0e2d44d7041d6cf58b6550e0bfa 138968.55 1.02 12.16
955fee9216a65b617aa5c0531780ce60 135171.70 0.99 13.15
46dc3b2cc0980fb8ec44634e21d2718e 128111.19 0.94 14.09
6560211a19b47992c3666cc44a7e94c0 123304.83 0.91 15.00
620c87c171fb2a6dd6e8bb4dec959fc6 114774.50 0.84 15.84
7d13fca15225358621be4086e1eb0964 113628.97 0.84 16.68
5dceca129747e92ff8ef7a997dc4f8ca 112155.53 0.83 17.50
1f50f920176fa81dab994f9023523100 106939.21 0.79 18.29
cc419e0650a3c5ba77189a1882b7556a 104288.42 0.77 19.06
a1043bafd471dff536d0c462352beb48 101901.16 0.75 19.81
3d871de0142ce09b7081e2b9d1733cb1 94914.20 0.70 20.51
edb1ef5e36e0c8cd84eb3c9b003e486d 79284.55 0.58 21.09
ccc4bbb5f32a6ab2b7066a4130f114e3 74004.62 0.54 21.63
f7ba60f8c3f99e7ee4042fdef03b70c4 68395.00 0.50 22.14
fe2032dab1a61af8794248c8196565c9 65959.61 0.49 22.62
8581055ce74af1daba164fdbd55a40de 64925.30 0.48 23.10
cca3071e3e9bb7d12640c9fbe2301306 64009.89 0.47 23.57
04308b1ee57b6625f47df1d56f00eedf 60130.60 0.44 24.01
522620dcb18a6b31cd7bdf73665113a9 57168.49 0.42 24.44
17e34d8224d27a541263c4c64b11a56b 56363.24 0.41 24.85
7ddcbb64b5bc1ef36ca8c151f6ec77df 55578.57 0.41 25.26
de722cd6dad950a92b7d4f82673f8833 55426.10 0.41 25.67

Observations:

  • The top 1% of sellers account for roughly 26 percent of both the number of products sold and the total sales value (the window-frame note below explains the repeated 26.02 in the first table)
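
A side note on the window frame used above: with ORDER BY product_count DESC and a RANGE frame, rows that tie on product_count are peers and receive the same running total, which is why the two sellers with 522 items both show 26.02. A ROWS frame would give a strictly increasing accumulated share; a sketch of the same logic (not executed here):

-- Sketch: accumulated share with a ROWS frame, which advances one row at a time
-- even when product_count values are tied.
WITH cte AS (
    SELECT
        seller_id,
        COUNT(product_id) AS product_count
    FROM order_items
    GROUP BY seller_id
)
SELECT
    seller_id,
    product_count,
    ROUND(SUM(product_count) OVER (ORDER BY product_count DESC
              ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) /
          SUM(product_count) OVER () * 100, 2) AS accumulated_percentage_product_count
FROM cte
ORDER BY product_count DESC
LIMIT 30;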

Descriptive statistics¶

In [53]:
%%sql
SELECT
    1 AS variable_no,
    'price' AS variable,
    COUNT(DISTINCT price) AS n,
    ROUND(AVG(price), 2) AS mean,
    ROUND(STDDEV_SAMP(price), 2) AS sample_stddev,
    MAX(price) AS max,
    MIN(price) AS min,
    MODE() WITHIN GROUP (ORDER BY price) AS mode,
    PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY price) AS "01_percentile",
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY price) AS "25_percentile",
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY price) AS median,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY price) AS "75_percentile",
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY price) AS "99_percentile"
FROM order_items
WHERE price IS NOT NULL
UNION
SELECT
    2 AS variable_no,
    'freight_value' AS variable,
    COUNT(DISTINCT freight_value) AS n,
    ROUND(AVG(freight_value), 2) AS mean,
    ROUND(STDDEV_SAMP(freight_value), 2) AS sample_stddev,
    MAX(freight_value) AS max,
    MIN(freight_value) AS min,
    MODE() WITHIN GROUP (ORDER BY freight_value) AS mode,
    PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY freight_value) AS "01_percentile",
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY freight_value) AS "25_percentile",
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY freight_value) AS median,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY freight_value) AS "75_percentile",
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY freight_value) AS "99_percentile"
FROM order_items
WHERE freight_value IS NOT NULL
ORDER BY variable_no;
 * postgresql://postgres:***@localhost:5432/olist
2 rows affected.
Out[53]:
variable_no variable n mean sample_stddev max min mode 01_percentile 25_percentile median 75_percentile 99_percentile
1 price 5968 120.65 183.63 6735.00 0.85 59.90 9.99 39.9 74.99 134.9 890.0
2 freight_value 6999 19.99 15.81 409.68 0.00 15.10 4.4198 13.08 16.26 21.15 84.52

Conclusions¶

The analysis shows that the top one percent of sellers account for roughly 26 percent of both products sold and revenue. This result can be cross-checked against the payment data in the table order_payments, for example along the lines of the sketch below.
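
A rough cross-check of that kind (a sketch only, not executed in this notebook) compares, order by order, the item value recorded in order_items (price plus freight) with the payment value recorded in order_payments; differences can arise from vouchers, cancellations, or orders present in only one of the tables.

-- Sketch: total item value vs. total payment value for orders that appear in both tables.
SELECT
    ROUND(SUM(i.item_total), 2)    AS total_item_value,
    ROUND(SUM(p.payment_total), 2) AS total_payment_value
FROM (
    SELECT order_id, SUM(price + freight_value) AS item_total
    FROM order_items
    GROUP BY order_id
) AS i
JOIN (
    SELECT order_id, SUM(payment_value) AS payment_total
    FROM order_payments
    GROUP BY order_id
) AS p USING (order_id);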

Explore the table order_payments¶

Ten rows of the data

In [54]:
%%sql
SELECT * FROM order_payments LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[54]:
order_id payment_sequential payment_type payment_installments payment_value
b81ef226f3fe1789b1e8b2acac839d17 1 credit_card 8 99.33
a9810da82917af2d9aefd1278f1dcfa0 1 credit_card 1 24.39
25e8ea4e93396b6fa0d3dd708e76c1bd 1 credit_card 1 65.71
ba78997921bbcdc1373bb41e913ab953 1 credit_card 8 107.78
42fdf880ba16b47b59251dd489d4441a 1 credit_card 2 128.45
298fcdf1f73eb413e4d26d01b25bc1cd 1 credit_card 2 96.12
771ee386b001f06208a7419e4fc1bbd7 1 credit_card 1 81.16
3d7239c394a212faae122962df514ac7 1 credit_card 3 51.84
1f78449c87a54faf9e96e88ba1491fa9 1 credit_card 6 341.09
0573b5e23cbd798006520e1d5b4c6714 1 boleto 1 51.95

The analysis of this table follows the same approach as for the previous tables.

Questions:

  • What are the shares of payment types?
  • What are the descriptive statistics of payment_sequential?
  • What are the descriptive statistics of payment_installments?
  • What are the descriptive statistics of payment_value?
  • What is the total value of the top 1% order in terms of payment_value?

Shares of payment types¶

In [55]:
%%sql
WITH cte AS (
SELECT
    payment_type,
    COUNT(payment_type) AS payment_count
FROM order_payments
GROUP BY payment_type
ORDER BY payment_count DESC
)
SELECT 
    payment_type,
    payment_count,
    ROUND(payment_count / SUM(payment_count) OVER() * 100, 2) AS share_of_payment_type,
    ROUND(SUM(payment_count) OVER (ORDER BY payment_count DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / SUM(payment_count) OVER() * 100, 2) AS accumulated_share
FROM cte
ORDER BY payment_count DESC;
 * postgresql://postgres:***@localhost:5432/olist
5 rows affected.
Out[55]:
payment_type payment_count share_of_payment_type accumulated_share
credit_card 76795 73.92 73.92
boleto 19784 19.04 92.97
voucher 5775 5.56 98.53
debit_card 1529 1.47 100.00
not_defined 3 0.00 100.00
In [56]:
# Extracting data from sql query
df_payment_shares = %sql SELECT payment_type, COUNT(payment_type) AS payment_count FROM order_payments GROUP BY payment_type ORDER BY payment_count DESC;
df_payment_shares = df_payment_shares.DataFrame().sort_values(by = ['payment_count'], ascending = False).replace({'credit_card': 'Credit Card', 
                                                                                                                  'boleto': 'Boleto',
                                                                                                                  'voucher': 'Voucher',
                                                                                                                  'debit_card': 'Debit Card',
                                                                                                                  'not_defined': 'Not defined'})

# Use `hole` to create a donut-like pie chart
fig = go.Figure(data = [go.Pie(labels = df_payment_shares.payment_type, 
                               values = df_payment_shares.payment_count, 
                               hole = .5, 
                               pull = [0.2, 0, 0, 0, 0])])
fig.update_layout(
    title = 'Shares of payment types of Olist customers (2016-2018)',
    autosize = False,
    width = 600,
    height = 500)
fig.update_traces(textposition = 'inside')
fig.update_layout(uniformtext_minsize = 12, uniformtext_mode = 'hide')

fig.show()
 * postgresql://postgres:***@localhost:5432/olist
5 rows affected.

Observations:

  • Credit card and boleto together account for approximately 93% of all payment records

Descriptive statistics¶

In [57]:
%%sql
SELECT
    1 AS variable_no,
    'payment_sequential' AS variable,
    COUNT(DISTINCT payment_sequential) AS n,
    ROUND(AVG(payment_sequential), 2) AS mean,
    ROUND(STDDEV_SAMP(payment_sequential), 2) AS sample_stddev,
    MAX(payment_sequential) AS max,
    MIN(payment_sequential) AS min,
    MODE() WITHIN GROUP (ORDER BY payment_sequential) AS mode,
    PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY payment_sequential) AS "01_percentile",
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY payment_sequential) AS "25_percentile",
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY payment_sequential) AS median,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY payment_sequential) AS "75_percentile",
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY payment_sequential) AS "99_percentile"
FROM order_payments
WHERE payment_sequential IS NOT NULL
UNION
SELECT
    2 AS variable_no,
    'payment_installments' AS variable,
    COUNT(DISTINCT payment_installments) AS n,
    ROUND(AVG(payment_installments), 2) AS mean,
    ROUND(STDDEV_SAMP(payment_installments), 2) AS sample_stddev,
    MAX(payment_installments) AS max,
    MIN(payment_installments) AS min,
    MODE() WITHIN GROUP (ORDER BY payment_installments) AS mode,
    PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY payment_installments) AS "01_percentile",
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY payment_installments) AS "25_percentile",
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY payment_installments) AS median,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY payment_installments) AS "75_percentile",
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY payment_installments) AS "99_percentile"
FROM order_payments
WHERE payment_installments IS NOT NULL
UNION 
SELECT
    3 AS variable_no,
    'payment_value' AS variable,
    COUNT(DISTINCT payment_value) AS n,
    ROUND(AVG(payment_value), 2) AS mean,
    ROUND(STDDEV_SAMP(payment_value), 2) AS sample_stddev,
    MAX(payment_value) AS max,
    MIN(payment_value) AS min,
    MODE() WITHIN GROUP (ORDER BY payment_value) AS mode,
    PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY payment_value) AS "01_percentile",
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY payment_value) AS "25_percentile",
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY payment_value) AS median,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY payment_value) AS "75_percentile",
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY payment_value) AS "99_percentile"
FROM order_payments
WHERE payment_value IS NOT NULL;
 * postgresql://postgres:***@localhost:5432/olist
3 rows affected.
Out[57]:
variable_no variable n mean sample_stddev max min mode 01_percentile 25_percentile median 75_percentile 99_percentile
1 payment_sequential 29 1.09 0.71 29 1 1 1.0 1.0 1.0 1.0 3.0
2 payment_installments 24 2.85 2.69 24 0 1 1.0 1.0 1.0 4.0 10.0
3 payment_value 29077 154.10 217.49 13664.08 0.00 50.00 6.69 56.79 100.0 171.8375 1039.9164999999994

Value of the top one percent orders¶

In [58]:
%%sql
SELECT 
    COUNT(order_id)/100 AS number_of_orders_in_top_1_percent
FROM order_payments;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[58]:
number_of_orders_in_top_1_percent
1038

There are roughly 1,038 records in the top one percent (note that COUNT(order_id) counts payment rows here rather than distinct orders, so this is an approximation)

In [59]:
%%sql
WITH cte AS ( 
    SELECT 
        order_id,
        SUM(payment_value) AS order_value
    FROM order_payments
    GROUP BY order_id
    ORDER BY order_value DESC
)
SELECT 
    ROW_NUMBER() OVER() AS row_number,
    order_id,
    order_value,
    ROUND(order_value / SUM(order_value) OVER() * 100, 4) AS order_value_share,
    ROUND(SUM(order_value) OVER(ORDER BY order_value DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / SUM(order_value) OVER() * 100, 4) AS accumulated_share
FROM cte
ORDER BY accumulated_share
LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[59]:
row_number order_id order_value order_value_share accumulated_share
1 03caa2c082116e1d31e67e9ae3700499 13664.08 0.0854 0.0854
2 736e1922ae60d0d6a89247b851902527 7274.88 0.0454 0.1308
3 0812eb902a67711a1cb742b3cdaa65ae 6929.31 0.0433 0.1741
4 fefacc66af859508bf1a7934eab1e97f 6922.21 0.0432 0.2173
5 f5136e38d1a14a4dbd87dff67da82701 6726.66 0.0420 0.2593
6 2cc9089445046817a7539d90805e6e5a 6081.54 0.0380 0.2973
7 a96610ab360d42a2e5335a3998b4718a 4950.34 0.0309 0.3282
8 b4c4b76c642808cbe472a32b86cddc95 4809.44 0.0300 0.3583
9 199af31afc78c699f0dbf71fb178d4d4 4764.34 0.0298 0.3881
10 8dbc85d1447242f3b127dda390d56e19 4681.78 0.0292 0.4173

The output is truncated to the first ten rows because of the large number of orders

Observations:

  • The top 1% of orders account for approximately 11% of the total payment value (see the sketch below for how this figure can be read off directly)
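
As a sketch (not executed here) of how that figure can be read off without scanning the truncated output, the running share can be filtered at the row whose rank equals the size of the top one percent computed above (1,038):

-- Sketch: accumulated payment-value share at the last row of the top one percent.
WITH cte AS (
    SELECT order_id, SUM(payment_value) AS order_value
    FROM order_payments
    GROUP BY order_id
),
ranked AS (
    SELECT
        ROW_NUMBER() OVER (ORDER BY order_value DESC) AS row_number,
        ROUND(SUM(order_value) OVER (ORDER BY order_value DESC
                  ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) /
              SUM(order_value) OVER () * 100, 2) AS accumulated_share
    FROM cte
)
SELECT row_number, accumulated_share
FROM ranked
WHERE row_number = 1038;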

Conclusions¶

The payment data is useful for investigating payment behavior, and this behavior can be investigated further by joining this table with the others; the sketch below gives one example.
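
One example of such a join (a sketch only, not executed in this notebook): tracking the share of payment value made by credit card per quarter by joining order_payments with orders.

-- Sketch: quarterly share of total payment value paid by credit card.
SELECT
    DATE_PART('year', o.purchase_timestamp) || ' Q' || DATE_PART('quarter', o.purchase_timestamp) AS quarter_timestamp,
    ROUND(SUM(CASE WHEN p.payment_type = 'credit_card' THEN p.payment_value ELSE 0 END) /
          SUM(p.payment_value) * 100, 2) AS credit_card_value_share
FROM order_payments AS p
JOIN orders AS o USING (order_id)
GROUP BY quarter_timestamp
ORDER BY quarter_timestamp;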

Explore the table order_reviews¶

First ten rows of the data

In [60]:
%%sql
SELECT * FROM order_reviews LIMIT 10;
 * postgresql://postgres:***@localhost:5432/olist
10 rows affected.
Out[60]:
review_id order_id rating review_title review_content creation_timestamp answer_timestamp
7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 None None 2018-01-18 00:00:00 2018-01-18 21:46:59
80e641a11e56f04c1ad469d5645fdfde a548910a1c6147796b98fdf73dbeba33 5 None None 2018-03-10 00:00:00 2018-03-11 03:05:13
228ce5500dc1d8e020d8d1322874b6f0 f9e4b658b201a9f2ecdecbb34bed034b 5 None None 2018-02-17 00:00:00 2018-02-18 14:36:24
e64fb393e7b32834bb789ff8bb30750e 658677c97b385a9be170737859d3511b 5 None Recebi bem antes do prazo estipulado. 2017-04-21 00:00:00 2017-04-21 22:02:06
f7c4243c7fe1938f181bec41a392bdeb 8e6bfb81e283fa7e4f11123a3fb894f1 5 None Parabéns lojas lannister adorei comprar pela Internet seguro e prático Parabéns a todos feliz Páscoa 2018-03-01 00:00:00 2018-03-02 10:26:53
15197aa66ff4d0650b5434f1b46cda19 b18dcdf73be66366873cd26c5724d1dc 1 None None 2018-04-13 00:00:00 2018-04-16 00:39:37
07f9bee5d1b850860defd761afa7ff16 e48aa0d2dcec3a2e87348811bcfdf22b 5 None None 2017-07-16 00:00:00 2017-07-18 19:30:34
7c6400515c67679fbee952a7525281ef c31a859e34e3adac22f376954e19b39d 5 None None 2018-08-14 00:00:00 2018-08-14 21:36:06
a3f6f7f6f433de0aefbb97da197c554c 9c214ac970e84273583ab523dfafd09b 5 None None 2017-05-17 00:00:00 2017-05-18 12:05:37
8670d52e15e00043ae7de4c01cc2fe06 b9bf720beb4ab3728760088589c62129 4 recomendo aparelho eficiente. no site a marca do aparelho esta impresso como 3desinfector e ao chegar esta com outro nome...atualizar com a marca correta uma vez que é o mesmo aparelho 2018-05-22 00:00:00 2018-05-23 16:45:47

This table is useful for investigating customer reviews by textually analyzing the review contents. However, that is outside the scope of this project, as NLP is a good topic for a project of its own, so the analysis here focuses on the rating and the average answering time.

Questions:

  • What are the descriptive statistics of rating?
  • What is the average answering time?
  • How does the average answer time change over time?

Descriptive statistics of rating¶

In [61]:
%%sql
SELECT
    1 AS variable_no,
    'rating' AS variable,
    COUNT(DISTINCT rating) AS n,
    ROUND(AVG(rating), 2) AS mean,
    ROUND(STDDEV_SAMP(rating), 2) AS sample_stddev,
    MAX(rating) AS max,
    MIN(rating) AS min,
    MODE() WITHIN GROUP (ORDER BY rating) AS mode,
    PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY rating) AS "01_percentile",
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY rating) AS "25_percentile",
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY rating) AS median,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY rating) AS "75_percentile",
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY rating) AS "99_percentile"
FROM order_reviews
WHERE rating IS NOT NULL;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[61]:
variable_no variable n mean sample_stddev max min mode 01_percentile 25_percentile median 75_percentile 99_percentile
1 rating 5 4.09 1.35 5 1 5 1.0 4.0 5.0 5.0 5.0

Average answering time¶

In [62]:
%%sql
SELECT 
    AVG(answer_timestamp - creation_timestamp) AS average_answering_time
FROM order_reviews;
 * postgresql://postgres:***@localhost:5432/olist
1 rows affected.
Out[62]:
average_answering_time
3 days, 3:34:33.029700

Observations:

  • It takes a little over three days on average for a review to be answered

Changes in the average answering time¶

In [63]:
%%sql
SELECT 
    DATE_PART('year', creation_timestamp) || ' Q' || DATE_PART('quarter', creation_timestamp) AS quarter_timestamp,
    COUNT(answer_timestamp) AS no_observations,
    AVG(answer_timestamp - creation_timestamp) AS average_answering_time
FROM order_reviews
GROUP BY quarter_timestamp
ORDER BY quarter_timestamp;
 * postgresql://postgres:***@localhost:5432/olist
8 rows affected.
Out[63]:
quarter_timestamp no_observations average_answering_time
2016 Q4 325 8 days, 19:56:34.790769
2017 Q1 4133 3 days, 20:47:04.692959
2017 Q2 9205 4 days, 0:53:09.817925
2017 Q3 12205 3 days, 6:30:36.288980
2017 Q4 17192 3 days, 2:11:31.840566
2018 Q1 20083 3 days, 3:04:36.774734
2018 Q2 21460 2 days, 21:20:07.352330
2018 Q3 14621 2 days, 15:17:10.215717

Observations:

  • For quarters with more than 4 thousand reviews, the average answering time ranges from about four days down to around two and a half days, trending downward overall

Executive summary¶

In this project, an exploratory data analysis is carried out using SQL as the main tool for extracting and manipulating data from the database, and plotly is used as the main package for visualizing the query results. The thought process behind the analysis is also briefly described in order to give insight into the way I reason about and approach the data. The achievements of this project include

  • Gaining many insights into each separate component of the e-commerce platform Olist
  • Visualizing the extracted tables with appropriate chart types
  • Presenting useful insights for further investigation throughout the analysis
  • Discussing relevant connections among the tables so that they can be utilized in other potential projects
  • Combining SQL and Python code to carry out the analysis

Furthermore, many SQL techniques are applied to make the thought process concrete, and the intuition behind the implemented charts is also discussed. These technical aspects include

  • Basic SQL operations for extracting, filtering, and ordering data from a table
  • Joining tables to combine data from different tables
  • Set operations (such as UNION) to combine the results of different queries
  • Using common table expressions (CTEs) to support table manipulation
  • Aggregating data with the GROUP BY clause and with window functions
  • Pivoting tables using the CROSSTAB function of the tablefunc extension (see the sketch after this list)
  • Casting data types and performing other, more involved calculations
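
For reference, a minimal illustration of the CROSSTAB pivoting mentioned in the list above. The pivot chosen here (payment counts by year and payment type) is only illustrative, not one of the pivots used earlier, and it assumes the tablefunc extension can be installed in the database.

-- Sketch: requires the tablefunc extension for CROSSTAB.
CREATE EXTENSION IF NOT EXISTS tablefunc;

-- Pivot yearly payment counts into one column per payment type;
-- the output columns must be declared explicitly in the AS clause.
SELECT *
FROM CROSSTAB(
    $$ SELECT DATE_PART('year', o.purchase_timestamp)::INT AS year,
              p.payment_type,
              COUNT(*)
       FROM orders AS o
       JOIN order_payments AS p USING (order_id)
       GROUP BY 1, 2
       ORDER BY 1, 2 $$,
    $$ VALUES ('boleto'), ('credit_card'), ('debit_card'), ('voucher') $$
) AS pivoted(year INT, boleto BIGINT, credit_card BIGINT, debit_card BIGINT, voucher BIGINT);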

Not many complicated charts are used to visualize the data in this project. Both vertical and horizontal bar charts are used to present count data, and a donut chart is used to illustrate shares. The choropleth map used to visualize the geographical distribution is arguably the only complex visualization implemented in this project.