Most of the material (e.g., the picture above) is borrowed from R for data science
dplyr
dplyr
package provides simple “verbs”, functions that correspond to the most common data manipulation tasks.library(tidyverse)
filter()
to select observations based on their values.arrange()
to reorder the observations.select()
to select variables based on their names.mutate()
to add variables as functions of existing variables.summarize()
to collapse many values down to a single summary.group_by()
to operate groupwise.nycflights13
All 336,776 flights that departed from NYC in 2013
print(nycflights13::flights)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables:
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
filter()
print(filter(flights, month == 1, day == 1))
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 832 more rows, and 12 more variables:
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
>, >=, <, <=, !=
, and ==
.print(filter(flights, month = 1))
Multiple arguments to filter()
are combined with:
&
for “and”|
for “or”!
for “not”print(filter(flights, month == 11 | month == 12))
## # A tibble: 55,403 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 1 5 2359 6 352
## 2 2013 11 1 35 2250 105 123
## 3 2013 11 1 455 500 -5 641
## 4 2013 11 1 539 545 -6 856
## 5 2013 11 1 542 545 -3 831
## 6 2013 11 1 549 600 -11 912
## 7 2013 11 1 550 600 -10 705
## 8 2013 11 1 554 600 -6 659
## 9 2013 11 1 554 600 -6 826
## 10 2013 11 1 554 600 -6 749
## # ... with 55,393 more rows, and 12 more variables:
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
filter()
df <- tibble(x = c(1, NA, 3))
print(filter(df, x > 1))
## # A tibble: 1 x 1
## x
## <dbl>
## 1 3
print(filter(df, is.na(x) | x > 1))
## # A tibble: 2 x 1
## x
## <dbl>
## 1 NA
## 2 3
arrange()
print(arrange(flights, year, month, day))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables:
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
arrange()
and desc()
print(arrange(flights, desc(arr_delay)))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 7 22 2257 759 898 121
## 9 2013 12 5 756 1700 896 1058
## 10 2013 5 3 1133 2055 878 1250
## # ... with 336,766 more rows, and 12 more variables:
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
arrange()
and missing valuesdf <- tibble(x = c(5, NA, 2))
print(arrange(df, x))
## # A tibble: 3 x 1
## x
## <dbl>
## 1 2
## 2 5
## 3 NA
print(arrange(df, desc(x)))
## # A tibble: 3 x 1
## x
## <dbl>
## 1 5
## 2 2
## 3 NA
select()
print(select(flights, year, month, day))
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
year
and day
print(select(flights, year:day))
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
year
to day
print(select(flights, -(year:day)))
## # A tibble: 336,776 x 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
## <int> <int> <dbl> <int> <int> <dbl>
## 1 517 515 2 830 819 11
## 2 533 529 4 850 830 20
## 3 542 540 2 923 850 33
## 4 544 545 -1 1004 1022 -18
## 5 554 600 -6 812 837 -25
## 6 554 558 -4 740 728 12
## 7 555 600 -5 913 854 19
## 8 557 600 -3 709 723 -14
## 9 557 600 -3 838 846 -8
## 10 558 600 -2 753 745 8
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
select()
and everything()
print(select(flights, time_hour, air_time, everything()))
## # A tibble: 336,776 x 19
## time_hour air_time year month day dep_time
## <dttm> <dbl> <int> <int> <int> <int>
## 1 2013-01-01 05:00:00 227 2013 1 1 517
## 2 2013-01-01 05:00:00 227 2013 1 1 533
## 3 2013-01-01 05:00:00 160 2013 1 1 542
## 4 2013-01-01 05:00:00 183 2013 1 1 544
## 5 2013-01-01 06:00:00 116 2013 1 1 554
## 6 2013-01-01 05:00:00 150 2013 1 1 554
## 7 2013-01-01 06:00:00 158 2013 1 1 555
## 8 2013-01-01 06:00:00 53 2013 1 1 557
## 9 2013-01-01 06:00:00 140 2013 1 1 557
## 10 2013-01-01 06:00:00 138 2013 1 1 558
## # ... with 336,766 more rows, and 13 more variables:
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # distance <dbl>, hour <dbl>, minute <dbl>
select()
select()
:
starts_with("abc")
: matches names that begin with “abc”.ends_with("xyz")
: matches names that end with “xyz”.contains("ijk")
: matches names that contain “ijk”.matches("(.)\\1")
: selects variables that match a regular expression (this one matches any variables that contain repeated characters).num_range("x", 1:3)
matches x1
, x2
and x3
. ?select
for more details.print(select(flights, contains("arr")))
## # A tibble: 336,776 x 4
## arr_time sched_arr_time arr_delay carrier
## <int> <int> <dbl> <chr>
## 1 830 819 11 UA
## 2 850 830 20 UA
## 3 923 850 33 AA
## 4 1004 1022 -18 B6
## 5 812 837 -25 DL
## 6 740 728 12 UA
## 7 913 854 19 B6
## 8 709 723 -14 EV
## 9 838 846 -8 B6
## 10 753 745 8 AA
## # ... with 336,766 more rows
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time)
print(flights_sml)
## # A tibble: 336,776 x 7
## year month day dep_delay arr_delay distance air_time
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227
## 2 2013 1 1 4 20 1416 227
## 3 2013 1 1 2 33 1089 160
## 4 2013 1 1 -1 -18 1576 183
## 5 2013 1 1 -6 -25 762 116
## 6 2013 1 1 -4 12 719 150
## 7 2013 1 1 -5 19 1065 158
## 8 2013 1 1 -3 -14 229 53
## 9 2013 1 1 -3 -8 944 140
## 10 2013 1 1 -2 8 733 138
## # ... with 336,766 more rows
mutate()
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time)
print(mutate(flights_sml,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60))
## # A tibble: 336,776 x 9
## year month day dep_delay arr_delay distance air_time gain speed
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 370.
## 2 2013 1 1 4 20 1416 227 16 374.
## 3 2013 1 1 2 33 1089 160 31 408.
## 4 2013 1 1 -1 -18 1576 183 -17 517.
## 5 2013 1 1 -6 -25 762 116 -19 394.
## 6 2013 1 1 -4 12 719 150 16 288.
## 7 2013 1 1 -5 19 1065 158 24 404.
## 8 2013 1 1 -3 -14 229 53 -11 259.
## 9 2013 1 1 -3 -8 944 140 -5 405.
## 10 2013 1 1 -2 8 733 138 10 319.
## # ... with 336,766 more rows
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time)
print(mutate(flights_sml,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours))
## # A tibble: 336,776 x 10
## year month day dep_delay arr_delay distance air_time gain hours
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 3.78
## 2 2013 1 1 4 20 1416 227 16 3.78
## 3 2013 1 1 2 33 1089 160 31 2.67
## 4 2013 1 1 -1 -18 1576 183 -17 3.05
## 5 2013 1 1 -6 -25 762 116 -19 1.93
## 6 2013 1 1 -4 12 719 150 16 2.5
## 7 2013 1 1 -5 19 1065 158 24 2.63
## 8 2013 1 1 -3 -14 229 53 -11 0.883
## 9 2013 1 1 -3 -8 944 140 -5 2.33
## 10 2013 1 1 -2 8 733 138 10 2.3
## # ... with 336,766 more rows, and 1 more variable:
## # gain_per_hour <dbl>
transmute()
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time)
print(transmute(flights_sml,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours))
## # A tibble: 336,776 x 3
## gain hours gain_per_hour
## <dbl> <dbl> <dbl>
## 1 9 3.78 2.38
## 2 16 3.78 4.23
## 3 31 2.67 11.6
## 4 -17 3.05 -5.57
## 5 -19 1.93 -9.83
## 6 16 2.5 6.4
## 7 24 2.63 9.11
## 8 -11 0.883 -12.5
## 9 -5 2.33 -2.14
## 10 10 2.3 4.35
## # ... with 336,766 more rows
Any vectorized function would work, but frequently useful are:
+, -, *, /, ˆ
.
air_time / 60
).x / sum(x)
or y - mean(y)
).%/%
(integer division) and %%
(remainder), where x == y * (x %/% y) + (x %% y)
.
hour = dep_time %/% 100
and minute = dep_time %% 100
)log(), log2(), log10()
.
lead()
and lag()
:
x - lag(x)
or find values change x != lag(x)
).x <- 1:10
lag(x)
## [1] NA 1 2 3 4 5 6 7 8 9
lead(x)
## [1] 2 3 4 5 6 7 8 9 10 NA
cumsum(), cumprod(), cummin(), cummax(), cummean()
.<, <=, >, >=, !=
min_rank(), row_number(), dense_rank(), percent_rank(), cume_dist(), ntile()
y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)
## [1] 1 2 2 NA 4 5
min_rank(desc(y))
## [1] 5 3 3 NA 2 1
row_number(y)
## [1] 1 2 3 NA 4 5
dense_rank(y)
## [1] 1 2 2 NA 3 4
percent_rank(y)
## [1] 0.00 0.25 0.25 NA 0.75 1.00
cume_dist(y)
## [1] 0.2 0.6 0.6 NA 0.8 1.0
summarize()
print(summarize(flights, delay = mean(dep_delay, na.rm = TRUE)))
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 12.6
summarize()
paired with group_by()
by_day <- group_by(flights, year, month, day)
print(summarize(by_day, delay = mean(dep_delay, na.rm = TRUE)))
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ... with 355 more rows
a1 <- group_by(flights, year, month, day)
a2 <- select(a1, arr_delay, dep_delay)
## Adding missing grouping variables: `year`, `month`, `day`
a3 <- summarize(a2,
arr = mean(arr_delay, na.rm = TRUE),
dep = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
print(filter(a3, arr > 30 | dep > 30))
## # A tibble: 49 x 5
## # Groups: year, month [11]
## year month day arr dep
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 16 34.2 24.6
## 2 2013 1 31 32.6 28.7
## 3 2013 2 11 36.3 39.1
## 4 2013 2 27 31.3 37.8
## 5 2013 3 8 85.9 83.5
## 6 2013 3 18 41.3 30.1
## 7 2013 4 10 38.4 33.0
## 8 2013 4 12 36.0 34.8
## 9 2013 4 18 36.0 34.9
## 10 2013 4 19 47.9 46.1
## # ... with 39 more rows
print(filter(summarize(select(group_by(flights, year, month, day),
arr_delay, dep_delay),
arr = mean(arr_delay, na.rm = TRUE),
dep = mean(dep_delay, na.rm = TRUE)),
arr > 30 | dep > 30))
## # A tibble: 49 x 5
## # Groups: year, month [11]
## year month day arr dep
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 16 34.2 24.6
## 2 2013 1 31 32.6 28.7
## 3 2013 2 11 36.3 39.1
## 4 2013 2 27 31.3 37.8
## 5 2013 3 8 85.9 83.5
## 6 2013 3 18 41.3 30.1
## 7 2013 4 10 38.4 33.0
## 8 2013 4 12 36.0 34.8
## 9 2013 4 18 36.0 34.9
## 10 2013 4 19 47.9 46.1
## # ... with 39 more rows
%>%
flights %>%
group_by(year, month, day) %>%
select(arr_delay, dep_delay) %>%
summarize(arr = mean(arr_delay, na.rm = TRUE),
dep = mean(dep_delay, na.rm = TRUE)) %>%
filter(arr > 30 | dep > 30) %>%
print()
## # A tibble: 49 x 5
## # Groups: year, month [11]
## year month day arr dep
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 16 34.2 24.6
## 2 2013 1 31 32.6 28.7
## 3 2013 2 11 36.3 39.1
## 4 2013 2 27 31.3 37.8
## 5 2013 3 8 85.9 83.5
## 6 2013 3 18 41.3 30.1
## 7 2013 4 10 38.4 33.0
## 8 2013 4 12 36.0 34.8
## 9 2013 4 18 36.0 34.9
## 10 2013 4 19 47.9 46.1
## # ... with 39 more rows
mean(), median()
.sd(), IQR(), mad()
.min(x), quantile(x, 0.25), max(x)
.first(x), nth(x, 2), last(x)
.n(x), sum(!is.na(x)), n_distinct(x)
.not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(dest) %>%
summarize(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers)) %>%
print()
## # A tibble: 104 x 2
## dest carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # ... with 94 more rows
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
count(dest) %>%
print()
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
count(tailnum, wt = distance) %>%
print()
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # ... with 4,027 more rows
sum(x > 10)
.not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarize(n_early = sum(dep_time < 500)) %>%
print()
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day n_early
## <int> <int> <int> <int>
## 1 2013 1 1 0
## 2 2013 1 2 3
## 3 2013 1 3 4
## 4 2013 1 4 3
## 5 2013 1 5 3
## 6 2013 1 6 2
## 7 2013 1 7 2
## 8 2013 1 8 1
## 9 2013 1 9 3
## 10 2013 1 10 3
## # ... with 355 more rows
mean(y == 0)
.not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarize(hour_perc = mean(arr_delay > 60)) %>%
print()
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day hour_perc
## <int> <int> <int> <dbl>
## 1 2013 1 1 0.0722
## 2 2013 1 2 0.0851
## 3 2013 1 3 0.0567
## 4 2013 1 4 0.0396
## 5 2013 1 5 0.0349
## 6 2013 1 6 0.0470
## 7 2013 1 7 0.0333
## 8 2013 1 8 0.0213
## 9 2013 1 9 0.0202
## 10 2013 1 10 0.0183
## # ... with 355 more rows
popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365) %>%
print()
## # A tibble: 332,577 x 19
## # Groups: dest [77]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 332,567 more rows, and 12 more variables:
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365)
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year:day, dest, arr_delay, prop_delay) %>%
print()
## # A tibble: 131,106 x 6
## # Groups: dest [77]
## year month day dest arr_delay prop_delay
## <int> <int> <int> <chr> <dbl> <dbl>
## 1 2013 1 1 IAH 11 0.000111
## 2 2013 1 1 IAH 20 0.000201
## 3 2013 1 1 MIA 33 0.000235
## 4 2013 1 1 ORD 12 0.0000424
## 5 2013 1 1 FLL 19 0.0000938
## 6 2013 1 1 ORD 8 0.0000283
## 7 2013 1 1 LAX 7 0.0000344
## 8 2013 1 1 DFW 31 0.000282
## 9 2013 1 1 ATL 12 0.0000400
## 10 2013 1 1 DTW 16 0.000116
## # ... with 131,096 more rows
“Happy families are all alike; every unhappy family is unhappy in its own way.” —- Leo Tolstoy
“Tidy datasets are all alike, but every messy dataset is messy in its own way.” —- Hadley Wickham
To learn more about the underlying theory, see the Tidy Data paper.
print(table1)
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
print(table2)
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
print(table3)
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
print(table4a)
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
print(table4b)
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
pivot_longer()
and pivot_wider()
.tidyr 1.0
:
pivot_longer()
and pivot_wider()
to convert between long and wide formsspread()
and gather()
!vignette("pivot")
for more detailspivot_longer()
Makes datasets longer by increasing the number of rows and decreasing the number of columns.
Commonly needed to tidy datasets optimized for ease of data entry or ease of comparison rather than ease of analysis.
table4a %>%
print()
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
table4a %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases") %>%
print()
## # A tibble: 6 x 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Afghanistan 2000 2666
## 3 Brazil 1999 37737
## 4 Brazil 2000 80488
## 5 China 1999 212258
## 6 China 2000 213766
names_to/values_to
: the name of the variable created from the data stored in the column names/cell value.gather()
versiontable4a %>%
gather(`1999`, `2000`,
key = "year", value = "cases") %>%
print()
## # A tibble: 6 x 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Brazil 1999 37737
## 3 China 1999 212258
## 4 Afghanistan 2000 2666
## 5 Brazil 2000 80488
## 6 China 2000 213766
table4b
table4b %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population") %>%
print()
## # A tibble: 6 x 3
## country year population
## <chr> <chr> <int>
## 1 Afghanistan 1999 19987071
## 2 Afghanistan 2000 20595360
## 3 Brazil 1999 172006362
## 4 Brazil 2000 174504898
## 5 China 1999 1272915272
## 6 China 2000 1280428583
table4a
and table4b
to tidy table table1
tidy4a <- table4a %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
tidy4b <- table4b %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
left_join(tidy4a, tidy4b) %>%
print()
## Joining, by = c("country", "year")
## # A tibble: 6 x 4
## country year cases population
## <chr> <chr> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
relig_income %>% print(n = 10)
## # A tibble: 18 x 11
## religion `<$10k` `$10-20k` `$20-30k` `$30-40k` `$40-50k` `$50-75k`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Agnostic 27 34 60 81 76 137
## 2 Atheist 12 27 37 52 35 70
## 3 Buddhist 27 21 30 34 33 58
## 4 Catholic 418 617 732 670 638 1116
## 5 Don’t kn~ 15 14 15 11 10 35
## 6 Evangeli~ 575 869 1064 982 881 1486
## 7 Hindu 1 9 7 9 11 34
## 8 Historic~ 228 244 236 238 197 223
## 9 Jehovah'~ 20 27 24 24 21 30
## 10 Jewish 19 19 25 25 30 95
## # ... with 8 more rows, and 4 more variables: $75-100k <dbl>,
## # $100-150k <dbl>, >150k <dbl>, Don't know/refused <dbl>
relig_income %>%
pivot_longer(-religion, names_to = "income", values_to = "count") %>%
print()
## # A tibble: 180 x 3
## religion income count
## <chr> <chr> <dbl>
## 1 Agnostic <$10k 27
## 2 Agnostic $10-20k 34
## 3 Agnostic $20-30k 60
## 4 Agnostic $30-40k 81
## 5 Agnostic $40-50k 76
## 6 Agnostic $50-75k 137
## 7 Agnostic $75-100k 122
## 8 Agnostic $100-150k 109
## 9 Agnostic >150k 84
## 10 Agnostic Don't know/refused 96
## # ... with 170 more rows
billboard %>%
print()
## # A tibble: 317 x 79
## artist track date.entered wk1 wk2 wk3 wk4 wk5 wk6 wk7
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 Pac Baby~ 2000-02-26 87 82 72 77 87 94 99
## 2 2Ge+h~ The ~ 2000-09-02 91 87 92 NA NA NA NA
## 3 3 Doo~ Kryp~ 2000-04-08 81 70 68 67 66 57 54
## 4 3 Doo~ Loser 2000-10-21 76 76 72 69 67 65 55
## 5 504 B~ Wobb~ 2000-04-15 57 34 25 17 17 31 36
## 6 98^0 Give~ 2000-08-19 51 39 34 26 26 19 2
## 7 A*Tee~ Danc~ 2000-07-08 97 97 96 95 100 NA NA
## 8 Aaliy~ I Do~ 2000-01-29 84 62 51 41 38 35 35
## 9 Aaliy~ Try ~ 2000-03-18 59 53 38 28 21 18 16
## 10 Adams~ Open~ 2000-08-26 76 76 74 69 68 67 61
## # ... with 307 more rows, and 69 more variables: wk8 <dbl>,
## # wk9 <dbl>, wk10 <dbl>, wk11 <dbl>, wk12 <dbl>, wk13 <dbl>,
## # wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>,
## # wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>,
## # wk24 <dbl>, wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>,
## # wk29 <dbl>, wk30 <dbl>, wk31 <dbl>, wk32 <dbl>, wk33 <dbl>,
## # wk34 <dbl>, wk35 <dbl>, wk36 <dbl>, wk37 <dbl>, wk38 <dbl>, ...
billboard %>%
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank",
values_drop_na = TRUE
) %>%
print()
## # A tibble: 5,307 x 5
## artist track date.entered week rank
## <chr> <chr> <date> <chr> <dbl>
## 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
## 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
## 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
## 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
## 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
## 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
## 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk7 99
## 8 2Ge+her The Hardest Part Of ... 2000-09-02 wk1 91
## 9 2Ge+her The Hardest Part Of ... 2000-09-02 wk2 87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02 wk3 92
## # ... with 5,297 more rows
names_prefix
argumentbillboard %>%
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
names_prefix = "wk",
values_to = "rank",
values_drop_na = TRUE,
) %>%
print()
## # A tibble: 5,307 x 5
## artist track date.entered week rank
## <chr> <chr> <date> <chr> <dbl>
## 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 1 87
## 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 2 82
## 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 3 72
## 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 4 77
## 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 5 87
## 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 6 94
## 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 7 99
## 8 2Ge+her The Hardest Part Of ... 2000-09-02 1 91
## 9 2Ge+her The Hardest Part Of ... 2000-09-02 2 87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02 3 92
## # ... with 5,297 more rows
pivot_wider()
pivot_longer()
print(table2)
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
names_from
: which columns to extract variable.values_from
: which column to extract value.table2 %>%
pivot_wider(names_from = type, values_from = count) %>%
print()
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
spread()
versiontable2 %>%
spread(key = type, value = count) %>%
print()
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
separate()
print(table3)
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
table3 %>%
separate(rate, into = c("cases", "population")) %>%
print()
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <chr> <chr>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
separate()
using convert = TRUE
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE) %>%
print()
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
separate()
using indextable3 %>%
separate(year, into = c("century", "year"), sep = 2)
## # A tibble: 6 x 4
## country century year rate
## <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
unite()
table5 %>%
print()
## # A tibble: 6 x 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
table5 %>%
unite(new, century, year, sep = "") %>%
print()
## # A tibble: 6 x 3
## country new rate
## <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583