Work on the questions in R. Make sure to keep a copy of your R code - you will be asked to submit this script at the end.
Data on all flights in and out of Des Moines (DSM) for October 2008
are available at
http://www.hofroe.net/data/dsm-flights.csv.
See
http://stat-computing.org/dataexpo/2009/the-data.html
for a description of the variables.
Update: the link to the csv file above is no longer valid. Please use the following code to load the data:
flights <- read.csv("https://raw.githubusercontent.com/DS202-at-ISU/DS202-at-ISU.github.io/master/exams/dsm-flights.csv")
# Read the Des Moines (DSM) flight records for October 2008 into `flights`.
flights <- read.csv(
  file = "https://raw.githubusercontent.com/DS202-at-ISU/DS202-at-ISU.github.io/master/exams/dsm-flights.csv"
)
# Question: which flight had the worst delay, where did it start, and was it
# already delayed when departing?
# Row index of the largest arrival delay (which.max discards missing values).
with(flights, which.max(ArrDelay))
## [1] 1516
# Origin airport and departure delay of that same row.
flights[which.max(flights[["ArrDelay"]]), c("Origin", "DepDelay")]
## Origin DepDelay
## 1516 DSM 614
# Put the days of the week into calendar order (Monday first).
#
# flights$Day is a character vector (see the summary output right below), so
# levels(flights$Day) is NULL. Building the level set by indexing that NULL
# handed factor() no usable levels, and all 2687 values became NA. Spelling
# the levels out works for character and factor input alike.
summary(flights$Day)
## Length Class Mode
## 2687 character character
days <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)
flights$Day <- factor(flights$Day, levels = days)
summary(flights$Day)
# Expected counts per day, from the original author's notes:
# Monday Tuesday Wednesday Thursday Friday Saturday Sunday
# 368 360 459 460 456 248 336
# Add a logical column Weekend: TRUE when Day is Saturday or Sunday.
# is.element() never returns NA, so a missing Day yields FALSE.
flights[["Weekend"]] <- is.element(flights[["Day"]], c("Saturday", "Sunday"))
summary(flights[["Weekend"]])
## Mode FALSE
## logical 2687
# The recorded output above has no TRUE count; the author's expected output is:
# Mode FALSE TRUE NA's
#logical 2103 584 0
# idea 1: arrivals at DSM by day of week, divided by how often each day of
# the week occurs in October 2008.
arrivals_by_day <- table(subset(flights, Dest == "DSM")$Day)
arrivals_by_day # overall number of flights by day of week
## < table of extent 0 >
# Monday Tuesday Wednesday Thursday Friday Saturday Sunday
# 184 180 230 230 228 124 168
# NOTE(review): the `##` results recorded in this block are empty, unlike the
# author's expected counts above - re-run once flights$Day is confirmed to be
# populated.
# problem: how many Mondays, Tuesdays, ... are there in October 2008?
# library() stops with an error when lubridate is missing; require() would
# only return FALSE and let the ymd() call below fail instead.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
octs <- data.frame(date = ymd(paste0("2008/10/", 1:31)))
octs$day <- wday(octs$date, label = TRUE)
table(octs$day)
##
## Sun Mon Tue Wed Thu Fri Sat
## 4 4 4 5 5 5 4
# Derive the divisor from the calendar instead of retyping it by hand.
# week_start = 1 numbers the days Monday = 1 ... Sunday = 7, so the counts
# come out in Monday..Sunday order, i.e. c(4, 4, 5, 5, 5, 4, 4), and do not
# depend on the locale of the weekday labels.
days_in_october <- tabulate(wday(octs$date, week_start = 1), nbins = 7)
# Assumes the levels of flights$Day run Monday..Sunday - TODO confirm.
arrivals_by_day / days_in_october
## numeric(0)
# idea 2: count the arrivals at DSM for each day of the month, then average
# those daily counts within each day of the week.
# library() fails immediately when a package is missing, whereas require()
# only returns FALSE and lets the pipeline below fail later.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
flights %>%
  filter(Dest == "DSM") %>%
  group_by(DayofMonth) %>%
  # Day[1]: the weekday of the first row of each day of the month; assumes the
  # data cover a single month so that all rows of a group share it.
  summarise(day = Day[1], n = n()) %>%
  group_by(day) %>%
  # avg: mean number of arrivals per calendar day; n: number of calendar days
  summarise(avg = mean(n), n = n())
## # A tibble: 1 × 3
## day avg n
## <fct> <dbl> <int>
## 1 <NA> 43.4 31
# Number of flights with destination DEN (rows with a missing Dest are not counted).
sum(flights$Dest == "DEN", na.rm = TRUE)
## [1] 145
# DEN flights as a percentage of all flights whose destination is not DSM.
sum(flights$Dest == "DEN", na.rm = TRUE) /
  sum(flights$Dest != "DSM", na.rm = TRUE) * 100
## [1] 10.79672
# Second entry of the origin counts, sorted from most to least frequent.
sort(table(flights$Origin), decreasing = TRUE)[2]
## ORD
## 379
library(ggplot2)
# Boxplots of arrival delay for flights into DSM, one box per origin airport.
# reorder() sorts the origins by its default summary (the mean) of ArrDelay,
# with na.rm passed on to that summary.
# The reorder() expression is kept token-for-token (including `T`) because
# ggplot2 uses the deparsed expression as the default x-axis title.
ggplot(
  data = filter(flights, Dest == "DSM"),
  mapping = aes(
    x = reorder(factor(Origin), ArrDelay, na.rm = T),
    y = ArrDelay
  )
) +
  geom_boxplot()
## Warning: Removed 8 rows containing non-finite values (`stat_boxplot()`).
Draw a scatterplot of average departure delay by scheduled hour of departure. Color points by top destination, and adjust point size to reflect the number of flights for each hour.
# Summarise the flights leaving DSM by scheduled departure hour:
#   count         number of flights in that hour
#   pct.delayed   percent of flights with DepDelay above 15; missing delays
#                 are not counted as delayed but stay in the denominator
#   avg.delay     mean DepDelay, missing values dropped
#   top.Dest.1-3  the three most frequent destinations (NA if fewer exist)
dep.summary <- flights %>%
  filter(Origin == "DSM") %>%
  # integer division by 100 keeps the leading digits of CRSDepTime -
  # presumably the hour of an hhmm-coded time; TODO confirm
  group_by(hour = CRSDepTime %/% 100) %>%
  summarise(
    count = n(),
    pct.delayed = sum(DepDelay > 15, na.rm = TRUE) / n() * 100,
    avg.delay = mean(DepDelay, na.rm = TRUE),
    top.Dest.1 = names(sort(table(Dest), decreasing = TRUE))[1],
    top.Dest.2 = names(sort(table(Dest), decreasing = TRUE))[2],
    top.Dest.3 = names(sort(table(Dest), decreasing = TRUE))[3]
  )
# Average delay against hour; colour shows the most frequent destination and
# point size the number of flights.
ggplot(
  data = dep.summary,
  mapping = aes(x = hour, y = avg.delay, colour = top.Dest.1, size = count)
) +
  geom_point()