For more details on using R Markdown see
http://rmarkdown.rstudio.com. When you click the
Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
Blogdown
Generating a site with Hugo is very fast; however, Rmd-files can take som time because all the code is executed and all the graphs are saved to disk. But the page is only regenerated when the contents have changed. On a 2GHz server with SSD, this page takes 24 seconds to produce each time it is modified. Pages with more sophisticated analysis will take longer. For complex analysis, the ability to just run the chunk you are working on in RStudio is very useful.
Drawing graphs with ggplot2
A lot examples are from the book (R for Data Science: Import, Tidy, Transform, Visualize, and Model Data)[http://amzn.to/2ixHRe4] which presents a consistent and easy to use collection of modern R packages for data science which is bundled in tidyverse library. Get the book, it’s great!
suppressMessages(library(tidyverse))
library(ggthemes)
ggplot(data = mpg) + geom_point(mapping = aes(x= displ, y = hwy, color=class)) +
theme_gdocs()
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_wrap(~ class, nrow = 2) +
theme_minimal()
front-wheel drive, r = rear wheel drive, 4 = 4wd
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color=class)) +
facet_grid(. ~ cyl) +
theme_minimal()
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color=class)) +
facet_grid(drv ~ .) +
theme_gdocs()
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color=class)) +
facet_grid(. ~ cyl) +
theme_gdocs()
ggplot(data = mpg) +
geom_smooth(mapping = aes(x = displ, y = hwy),method='loess') +
geom_point(mapping = aes(x = displ, y = hwy, color=class)) +
theme_gdocs()
ggplot(data = mpg) +
geom_smooth(mapping = aes(x = displ, y = hwy, linetype= drv), method=loess ) +
geom_point(mapping = aes(x = displ, y = hwy, color=class))
Mapping passed to the geoms:
ggplot(data = mpg,(mapping = aes(x = displ, y = hwy, linetype= drv) )) +
geom_smooth( method=loess ) +
geom_point( mapping = aes( color=class)) + # <-- extend/overwrite global aes.
theme_gdocs()
ggplot( data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_point(mapping = aes(color = class), position="jitter")+
geom_smooth(
data = filter(mpg, class == 'subcompact'),
se = FALSE,
method = loess
) +
theme_gdocs()
ggplot( data = diamonds) +
geom_bar( mapping = aes(x = cut)) +
theme_gdocs()
d <- tribble (
~a, ~b,
"v1", 10,
"v2", 20
)
ggplot(data = d) +
geom_bar( mapping = aes(x = a, y = b), stat = "identity") +
coord_flip() +
theme_gdocs()
ggplot(data = diamonds, mapping = aes(x = cut, y = depth, color = clarity)) +
stat_summary(
fun.ymin = min,
fun.ymax = max,
fun.y = median
) +
theme_gdocs()
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge") +
theme_gdocs()
ggplot(data = mpg) +
geom_boxplot( mapping = aes( x = class,y = cty)) +
theme_gdocs()
dplyr
library(nycflights13)
library(htmlTable)
flights_to_IAH_HOU <- filter(flights,
dest %in% c("IAH","HOU") &
arr_delay > 120 &
carrier %in% c("UA","DL") &
month %in% c(7,8,9) &
between(dep_time,0000,0600))
flights_by_date <- arrange(flights, year, month, day)
flights_by_descending_date <- arrange(flights, desc(year), desc(month), desc(day))
htmlTable(select(flights_to_IAH_HOU, year:day, carrier),
caption = "Flights to honolulo with a delay of more than two hours flwn by United or Delta in july through september departing between 00 - 06")
Flights to honolulo with a delay of more than two hours flwn by United or Delta in july through september departing between 00 - 06
|
|
year
|
month
|
day
|
carrier
|
1
|
2013
|
7
|
10
|
UA
|
2
|
2013
|
7
|
28
|
UA
|
vars <- c('year','month','day','dep_delay','arr_delay')
htmlTable(select(flights_to_IAH_HOU, one_of(vars)))
|
year
|
month
|
day
|
dep_delay
|
arr_delay
|
1
|
2013
|
7
|
10
|
331
|
301
|
2
|
2013
|
7
|
28
|
348
|
324
|
Data wrangling
flights_sel <- select(flights, year:day, ends_with('delay'), distance, air_time)
mutate(flights_sel,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60
)
## # A tibble: 336,776 x 9
## year month day dep_delay arr_delay distance air_time gain speed
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 370.
## 2 2013 1 1 4 20 1416 227 16 374.
## 3 2013 1 1 2 33 1089 160 31 408.
## 4 2013 1 1 -1 -18 1576 183 -17 517.
## 5 2013 1 1 -6 -25 762 116 -19 394.
## 6 2013 1 1 -4 12 719 150 16 288.
## 7 2013 1 1 -5 19 1065 158 24 404.
## 8 2013 1 1 -3 -14 229 53 -11 259.
## 9 2013 1 1 -3 -8 944 140 -5 405.
## 10 2013 1 1 -2 8 733 138 10 319.
## # ... with 336,766 more rows
transmute(flights_sel,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60
)
## # A tibble: 336,776 x 2
## gain speed
## <dbl> <dbl>
## 1 9 370.
## 2 16 374.
## 3 31 408.
## 4 -17 517.
## 5 -19 394.
## 6 16 288.
## 7 24 404.
## 8 -11 259.
## 9 -5 405.
## 10 10 319.
## # ... with 336,766 more rows
delay_mean <- summarize(flights,delay = mean(dep_delay, na.rm = TRUE))
print(delay_mean)
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 12.6
by_month <- group_by(flights, month)
delay_month <- summarize(by_month, delay = mean(dep_delay, na.rm = TRUE))
delay_month <- mutate(delay_month, small_delay = (delay < 15) )
ggplot(data = delay_month, mapping = aes(x = month,y = delay, fill=small_delay)) +
geom_bar(stat = "identity") +
scale_x_discrete(limits = month.abb) +
theme_gdocs()
delay <- flights %>%
group_by(dest) %>%
summarize(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
ggplot(data = delay, mapping = aes( x = dist, y = delay)) +
geom_point( aes(size = count), alpha = 1/1) +
geom_smooth( se = FALSE, method = loess)
not_cancelled <- flights %>% filter(!is.na(dep_delay), !is.na(arr_delay))
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarize(
delay = mean(arr_delay)
)
ggplot(data = delays, mapping = aes(x = delay)) +
geom_area(stat="bin",binwidth = 10) +
geom_freqpoly(binwidth = 10)