library(dplyr)
library(explore)
# Rename a variable ----

# Load the Titanic example data (2,201 rows, 4 character columns; see output).
data <- use_data_titanic(count = FALSE)
glimpse(data)
#> Rows: 2,201
#> Columns: 4
#> $ Class <chr> "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd"…
#> $ Sex <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male…
#> $ Age <chr> "Child", "Child", "Child", "Child", "Child", "Child", "Child"…
#> $ Survived <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "…

# clean_var(..., name = ) renames the column: `Age` becomes `age` below.
data <- data %>% clean_var(Age, name = "age")
glimpse(data)
#> Rows: 2,201
#> Columns: 4
#> $ Class <chr> "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd"…
#> $ Sex <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male…
#> $ age <chr> "Child", "Child", "Child", "Child", "Child", "Child", "Child"…
#> $ Survived <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "…
# Replace missing values ----

# Load the beer example data and inspect the variable before cleaning.
data <- use_data_beer()
data %>% describe(energy_kcal_100ml)
#> variable = energy_kcal_100ml
#> type = double
#> na = 11 of 161 (6.8%)
#> unique = 34
#> min|max = 20 | 62
#> q05|q95 = 24 | 56.65
#> q25|q75 = 37 | 44
#> median = 42
#> mean = 39.89333

# Replace the 11 missing values with 42 (the median reported above);
# afterwards `na` drops to 0 of 161.
data <- data %>% clean_var(energy_kcal_100ml, na = 42)
data %>% describe(energy_kcal_100ml)
#> variable = energy_kcal_100ml
#> type = double
#> na = 0 of 161 (0%)
#> unique = 33
#> min|max = 20 | 62
#> q05|q95 = 24 | 55
#> q25|q75 = 38 | 44
#> median = 42
#> mean = 40.03727
# Limit a variable to a value range ----

# Create the person example data and inspect `age` before cleaning.
data <- create_data_person()
data %>% describe(age)
#> variable = age
#> type = integer
#> na = 0 of 1 000 (0%)
#> unique = 80
#> min|max = 16 | 95
#> q05|q95 = 21 | 92
#> q25|q75 = 37 | 76
#> median = 55
#> mean = 55.845

# Restrict `age` to [20, 80]. The row count stays at 1 000 and min|max becomes
# 20 | 80, so out-of-range values are capped rather than dropped.
data <- data %>% clean_var(age, min_val = 20, max_val = 80)
data %>% describe(age)
#> variable = age
#> type = integer
#> na = 0 of 1 000 (0%)
#> unique = 61
#> min|max = 20 | 80
#> q05|q95 = 21 | 80
#> q25|q75 = 37 | 76
#> median = 55
#> mean = 54.276
# Rescale a variable to [0, 1] ----

# Inspect `income` before cleaning (`data` is the data set already in scope).
data %>% describe(income)
#> variable = income
#> type = double
#> na = 0 of 1 000 (0%)
#> unique = 228
#> min|max = 0 | 150
#> q05|q95 = 6 | 123.025
#> q25|q75 = 35 | 88.625
#> median = 62
#> mean = 61.5875

# rescale01 = TRUE maps the observed range 0 | 150 onto 0 | 1.
data <- data %>% clean_var(income, rescale01 = TRUE)
data %>% describe(income)
#> variable = income
#> type = double
#> na = 0 of 1 000 (0%)
#> unique = 228
#> min|max = 0 | 1
#> q05|q95 = 0.04 | 0.820167
#> q25|q75 = 0.233333 | 0.590833
#> median = 0.4
#> mean = 0.410583
# Simplify text values ----

# Inject two inconsistent spellings (padded lower case, upper case) so the
# effect of simplify_text is visible in the frequency table.
data[1, "handset"] <- " android "
data[2, "handset"] <- "ANDROID"
data %>% describe(handset)
#> variable = handset
#> type = character
#> na = 0 of 1 000 (0%)
#> unique = 5
#> android = 1 (0.1%)
#> ANDROID = 1 (0.1%)
#> Android = 471 (47.1%)
#> Apple = 430 (43%)
#> Other = 97 (9.7%)

# simplify_text = TRUE merges the three Android spellings into one upper-case
# level (1 + 1 + 471 = 473), reducing the unique values from 5 to 3.
data <- data %>% clean_var(handset, simplify_text = TRUE)
data %>% describe(handset)
#> variable = handset
#> type = character
#> na = 0 of 1 000 (0%)
#> unique = 3
#> ANDROID = 473 (47.3%)
#> APPLE = 430 (43%)
#> OTHER = 97 (9.7%)
# Drop variables ----
#
# drop_var_no_variance()   Drop all variables with no variance
# drop_var_not_numeric()   Drop all not numeric variables
# drop_var_low_variance()  Drop all variables with low variance
# drop_var_by_names()      Drop variables by name
# drop_var_with_na()       Drop all variables with NA-values

# Reload the beer example data and summarise it before dropping anything.
data <- use_data_beer()
data %>% describe_tbl()
#> 161 observations with 11 variables
#> 19 observations containing missings (NA)
#> 5 variables containing missings (NA)
#> 1 variables with no variance

# The results below are not assigned back, so `data` itself stays unchanged
# and each example starts from the full 11-variable table.
data %>%
  drop_var_no_variance() %>%
  describe_tbl()
#> 161 observations with 10 variables
#> 19 observations containing missings (NA)
#> 5 variables containing missings (NA)
#> 0 variables with no variance

data %>%
  drop_var_with_na() %>%
  describe_tbl()
#> 161 observations with 6 variables
#> 0 observations containing missings (NA)
#> 0 variables containing missings (NA)
#> 1 variables with no variance
# Drop observations ----
#
# drop_obs_with_na()  Drop all observations with NA-values

# Removes the rows that contain an NA; per the output, 142 observations remain
# and all 11 variables are kept.
data %>%
  drop_obs_with_na() %>%
  describe_tbl()
#> 142 observations with 11 variables
#> 0 observations containing missings (NA)
#> 0 variables containing missings (NA)
#> 1 variables with no variance
# drop_obs_if()  Drop all observations where expression is true

# Frequency of `type` before dropping anything.
data %>%
  count_pct(type)
#> # A tibble: 3 × 4
#> type n total pct
#> <chr> <int> <int> <dbl>
#> 1 Alkoholfrei 27 161 16.8
#> 2 Bock 8 161 4.97
#> 3 Rest 126 161 78.3

# Drop the 27 rows where type == "Alkoholfrei"; 134 observations remain.
data %>%
  drop_obs_if(type == "Alkoholfrei") %>%
  count_pct(type)
#> # A tibble: 2 × 4
#> type n total pct
#> <chr> <int> <int> <dbl>
#> 1 Bock 8 134 5.97
#> 2 Rest 126 134 94.0