On March 16, 2022, Tal Mizrachi (The amazing Analysis Paralysis) & Amit Levinson live streamed an R vs/and python data analysis session.
You can find below the questions along with solutions in each programming language. We used the dogs of Zurich Kaggle dataset, cleaned it a bit and added a random DOB column. The files are available for download in the button below, or in Amit’s GitHub Along with the raw files.
Follow along with the full recording
Solved it? Enjoyed? Have any feedback? Let us know! We’d love to hear!
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
library(purrr)
library(dplyr)
library(readr)
library(tidyr)
library(ggplot2)
library(tidytext)
library(reticulate)
= []
all_files for file in os.listdir("split_data/"):
if not file.endswith("csv"):
continue
"split_data/"+file))
all_files.append(pd.read_csv(
= pd.concat(all_files, ignore_index=True)
df
df
## owner_id age gender ... dog_gender dog_color dog_dob
## 0 124913 41-50 w ... m saufarben 1996-11-10
## 1 84494 61-70 w ... w schwarz/rot 1997-08-10
## 2 85293 81-90 w ... w beige/weiss 1997-09-10
## 3 80204 51-60 w ... m dreifarbig 1998-08-10
## 4 80399 61-70 w ... m tricolor 1998-08-10
## ... ... ... ... ... ... ... ...
## 7150 135725 31-40 w ... w gelb/weiss 2016-03-10
## 7151 135726 11-20 w ... w schwarz 2016-09-10
## 7152 135728 31-40 w ... w vierfarbig 2016-02-10
## 7153 135731 21-30 m ... m schwarz 2016-12-10
## 7154 67272 31-40 w ... w tricolor 2017-05-10
##
## [7155 rows x 14 columns]
<- paste0("split_data/",list.files(path = "split_data/"))
files <- map_dfr(files, read_csv, col_types = cols())
dogs
dogs
## # A tibble: 7,155 × 14
## owner_id age gender city_district quarter primary_breed is_hybrid
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 124913 41-50 w 11 115 Spitz Mischling
## 2 84494 61-70 w 10 102 Dachshund <NA>
## 3 85293 81-90 w 11 115 Malteser Mischling
## 4 80204 51-60 w 11 111 Appenzeller <NA>
## 5 80399 61-70 w 12 123 Appenzeller Mischling
## 6 82293 71-80 w 2 24 Mischling klein <NA>
## 7 82452 61-70 w 10 102 Dachshund <NA>
## 8 82452 61-70 w 10 102 Dachshund <NA>
## 9 83136 71-80 m 3 31 Mischling klein <NA>
## 10 83476 61-70 w 11 119 West Highland White Te… <NA>
## # … with 7,145 more rows, and 7 more variables: secondary_breed <chr>,
## # is_secondary_hybrid <lgl>, breed_type <chr>, dog_yob <dbl>,
## # dog_gender <chr>, dog_color <chr>, dog_dob <date>
=[0.2,0.8]) df.describe(percentiles
## owner_id city_district ... is_secondary_hybrid dog_yob
## count 7155.000000 7154.000000 ... 0.0 7155.000000
## mean 105854.243326 7.413195 ... NaN 2009.803215
## std 21260.952718 3.261187 ... NaN 4.115384
## min 126.000000 1.000000 ... NaN 1996.000000
## 20% 87256.000000 4.000000 ... NaN 2006.000000
## 50% 105784.000000 8.000000 ... NaN 2010.000000
## 80% 126879.800000 11.000000 ... NaN 2014.000000
## max 135731.000000 12.000000 ... NaN 2017.000000
##
## [8 rows x 5 columns]
::skim(dogs) skimr
Name | dogs |
Number of rows | 7155 |
Number of columns | 14 |
_______________________ | |
Column type frequency: | |
character | 8 |
Date | 1 |
logical | 1 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
age | 1 | 1.00 | 5 | 6 | 0 | 9 | 0 |
gender | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
primary_breed | 0 | 1.00 | 3 | 34 | 0 | 304 | 0 |
is_hybrid | 6537 | 0.09 | 9 | 9 | 0 | 1 | 0 |
secondary_breed | 6612 | 0.08 | 4 | 30 | 0 | 118 | 0 |
breed_type | 0 | 1.00 | 1 | 2 | 0 | 3 | 0 |
dog_gender | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
dog_color | 0 | 1.00 | 3 | 28 | 0 | 169 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
dog_dob | 0 | 1 | 1996-11-10 | 2017-05-10 | 2010-08-10 | 226 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
is_secondary_hybrid | 7155 | 0 | NaN | : |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
owner_id | 0 | 1 | 105854.24 | 21260.95 | 126 | 88587.5 | 105784 | 124634.5 | 135731 | ▁▁▁▇▇ |
city_district | 1 | 1 | 7.41 | 3.26 | 1 | 5.0 | 8 | 10.0 | 12 | ▅▂▅▅▇ |
quarter | 1 | 1 | 76.70 | 33.03 | 4 | 51.0 | 81 | 102.0 | 123 | ▂▃▆▅▇ |
dog_yob | 0 | 1 | 2009.80 | 4.12 | 1996 | 2007.0 | 2010 | 2013.0 | 2017 | ▁▃▆▇▇ |
= df['dog_color'].str.split("/", expand=True).melt().copy()
melted
'value'].value_counts() melted[
## schwarz 2349
## weiss 2207
## braun 1836
## tricolor 696
## beige 622
## ...
## bicolor 1
## schwarz melliert 1
## hirschrot mit Maske 1
## marronschimmel 1
## rotblond 1
## Name: value, Length: 89, dtype: int64
%>%
dogs transmute(
new_color = strsplit(dog_color, split = "/")
%>%
) unnest() %>%
count(new_color, sort = T)
## # A tibble: 89 × 2
## new_color n
## <chr> <int>
## 1 schwarz 2349
## 2 weiss 2207
## 3 braun 1836
## 4 tricolor 696
## 5 beige 622
## 6 rot 368
## 7 grau 282
## 8 tan 228
## 9 hellbraun 170
## 10 black 160
## # … with 79 more rows
= df[['dog_gender','primary_breed']].pivot_table(index='primary_breed', columns='dog_gender', aggfunc=len).copy()
main_table
= main_table.sort_values('w', ascending=False).head(10).copy()
sorted_by_w = main_table.sort_values('m', ascending=False).head(10).copy()
sorted_by_m
= plt.subplots(1,2)
fig, axes
"This is a title", fontsize=18)
fig.suptitle(
='bar', rot=40, color=["pink", "teal"], ax=axes[0], figsize=(20,5))
sorted_by_w.plot(kind='bar', rot=40, color=["pink", "teal"], ax=axes[1], figsize=(20,5)) sorted_by_m.plot(kind
%>%
dogs count(dog_gender, primary_breed, sort = T) %>%
group_by(dog_gender) %>%
slice(1:10) %>%
ggplot(aes(y = reorder_within(primary_breed, by = n, within = dog_gender), x = n)) +
geom_col() +
facet_wrap(vars(dog_gender), scales = "free_y") +
scale_y_reordered() +
labs(y = NULL, title = "This is a title")+
theme_minimal()
'dog_dob'] = pd.to_datetime(df['dog_dob'])
df[= df[['owner_id', 'dog_dob']].groupby('owner_id').agg([min, max])
gb
= ["_".join(list(x)) for x in gb.columns]
gb.columns
'diff'] = (gb['dog_dob_max']-gb['dog_dob_min']).dt.days
gb[
'diff').tail(1) gb.sort_values(
## dog_dob_min dog_dob_max diff
## owner_id
## 87158 2000-04-10 2015-10-10 5661
%>%
dogs group_by(owner_id) %>%
filter(
n() > 1,
== min(dog_dob) | dog_dob == max(dog_dob)
dog_dob %>%
) select(owner_id, dog_dob) %>%
arrange(owner_id, dog_dob) %>%
summarise(
timediff = difftime(dog_dob, time2 = lag(dog_dob), units = "days")
%>%
) fill(timediff, .direction = "up") %>%
arrange(-timediff)
## # A tibble: 1,153 × 2
## # Groups: owner_id [576]
## owner_id timediff
## <dbl> <drtn>
## 1 87158 5661 days
## 2 87158 5661 days
## 3 88250 5265 days
## 4 88250 5265 days
## 5 100112 5234 days
## 6 100112 5234 days
## 7 109472 4932 days
## 8 109472 4932 days
## 9 128461 4932 days
## 10 128461 4932 days
## # … with 1,143 more rows
Another solution suggested by Adi Sarid
%>%
dogs group_by(owner_id) %>%
summarise(age_gap = max(dog_dob) - min(dog_dob)) %>%
arrange(desc(age_gap))
= pd.ExcelWriter("dogs_by_owner_age_python.xlsx")
writer
for age in df['age'].fillna("None").unique():
= df[df['age']==age].copy()
temp_df =age)
temp_df.to_excel(writer, sheet_name
writer.save()
%>%
dogs split(.$age) %>%
::write_xlsx(path = "dogs_by_owners_age.xlsx") writexl