Load some necessary packages.
library(dplyr)
library(ggplot2)
library(stringr)
library(Lahman)
library(readr)
The ESPN Home Run Tracker (http://www.hittrackeronline.com/) records a number of variables for each home run hit during the current season. I collected these data for five baseball seasons (2012 through 2016); the CSV file homeruns.csv contains data on the 24,299 home runs hit during these five seasons.
# Download the home run data set from the book's site into `d`, then
# print its first six rows as a quick sanity check.
d <- read_csv(
  file = "https://bayesball.github.io/VB/data/homeruns.csv"
)
head(d, n = 6)
## # A tibble: 6 x 16
## Date Video Path Hitter H_Team Pitcher P_Team
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 10/3/12 Video View Longoria, Evan TB Arrieta, Jake BAL
## 2 10/3/12 Video View Johnson, Dan CHW Huff, David CLE
## 3 10/3/12 Video View Maybin, Cameron SD Kintzler, Brandon MIL
## 4 10/3/12 Video View Cano, Robinson NYY Mortensen, Clayton BOS
## 5 10/3/12 Video View Moore, Tyler WSH Lee, Cliff PHI
## 6 10/3/12 Video View Longoria, Evan TB Tillman, Chris BAL
## # ... with 9 more variables: Inning <int>, Ballpark <chr>,
## # `Type/Luck` <chr>, True_Dist <int>, Speed_off_Bat <dbl>,
## # Elevation_Angle <dbl>, Horiz_Angle <dbl>, Apex <int>, N_Parks <int>
In the book, I define the horizontal angle as 180 - Horiz_Angle, where Horiz_Angle
is the horizontal angle as defined on the website.
Here is a density plot of the collection of horizontal angles.
# Density of the book's horizontal angle (180 - Horiz_Angle) over all home
# runs, with a reference line at 90 and text labels marking the left-field
# and right-field sides of the plot.
ggplot(data = d, mapping = aes(x = 180 - Horiz_Angle)) +
  geom_density() +
  xlim(30, 150) +
  labs(x = "Horizontal Angle", y = "Density") +
  geom_vline(xintercept = 90) +
  annotate(
    "text",
    x = 40, y = 0.015,
    label = "Left\nField", size = 6
  ) +
  annotate(
    "text",
    x = 140, y = 0.015,
    label = "Right\nField", size = 6
  )
Here I graph the horizontal angle against the home run distance and add a smoothing curve to show the general pattern.
# Scatterplot of home run distance against horizontal angle. Points are
# drawn nearly transparent (alpha = 0.1) because they overlap heavily, and
# a default geom_smooth() curve is laid on top to show the overall trend.
ggplot(data = d, mapping = aes(x = 180 - Horiz_Angle, y = True_Dist)) +
  geom_point(alpha = 0.1) +
  geom_smooth() +
  xlim(45, 130) +
  ylim(300, 500) +
  labs(x = "Horizontal Angle", y = "Distance")
Here I get information about the batting side of each hitter and merge this information with the main dataset.
# Split every hitter string on the comma; the first piece is used as the
# last name and the second piece as the first name below.
Names <- str_split(d$Hitter, ",")

# Return the k-th comma-separated piece of the j-th hitter string with
# surrounding whitespace removed (NA when that piece does not exist).
one_row <- function(j, k) {
  str_trim(Names[[j]][k])
}

# Iterate over the names actually present instead of a fixed row count, so
# the script keeps working if the data file gains or loses rows.
# vapply() guarantees a character vector even for empty input.
d$LastName <- vapply(seq_along(Names), one_row, character(1), k = 1)
d$FirstName <- vapply(seq_along(Names), one_row, character(1), k = 2)

# Attach each hitter's batting side from the Lahman player table, matching
# on last and first name. inner_join() drops home runs whose hitter name
# has no match in that table.
# NOTE(review): the join key is the name only, so a name shared by several
# players in the lookup table would match more than one row -- confirm
# whether duplicated home run rows matter for the analysis.
# NOTE(review): recent Lahman releases appear to call this table `People`
# rather than `Master` -- confirm against the installed package version.
d2 <- inner_join(
  d,
  select(Master, nameLast, nameFirst, bats),
  by = c(
    "LastName" = "nameLast",
    "FirstName" = "nameFirst"
  )
)

# Facet label for the batting side. Any non-missing `bats` value other
# than "R" receives the left-handed label.
d2$Batting <- ifelse(
  d2$bats == "R",
  "Right-Handed Hitter",
  "Left-Handed Hitter"
)
Here I look at the effect of batting side, showing how the distribution of the horizontal angle differs between right-handed and left-handed hitters.
# Density of the horizontal angle drawn separately for each batting-side
# label, one panel stacked above the other. Only rows whose `bats` value
# is "R" or "L" are plotted.
d2 %>%
  filter(bats %in% c("R", "L")) %>%
  ggplot(mapping = aes(x = 180 - Horiz_Angle)) +
  geom_density(size = 1.0) +
  xlim(45, 130) +
  labs(x = "Horizontal Angle", y = "Density") +
  facet_wrap(~ Batting, ncol = 1) +
  theme(strip.text = element_text(face = "bold", size = 16))
Here I look at the proportion of home runs hit to the left side of the field in each ballpark (Figure 6.7).
# Per-ballpark counts of home runs whose horizontal angle falls below 90
# (NL) and above 90 (NR), plus PL, the share of those counted home runs
# that fall below 90.
S <- d2 %>%
  group_by(Ballpark) %>%
  summarise(
    NL = sum(180 - Horiz_Angle < 90),
    NR = sum(180 - Horiz_Angle > 90),
    PL = NL / (NL + NR)
  )

# Dot plot of PL for parks with more than 200 counted home runs; axes are
# flipped so park names read horizontally, with a reference line at 0.5.
S %>%
  filter(NL + NR > 200) %>%
  ggplot(mapping = aes(x = Ballpark, y = PL)) +
  geom_point() +
  coord_flip() +
  ylab("Proportion of Home Runs to Left") +
  geom_hline(yintercept = 0.5)
I focus on 12 extreme parks: the eight with the highest proportions of home runs to left and the four with the lowest.
# Parks with more than 200 counted home runs, sorted from the highest to
# the lowest proportion hit to left.
S200 <- arrange(filter(S, NL + NR > 200), desc(PL))

# The extreme parks: the first 8 rows (highest PL) and the last 4 rows
# (lowest PL). head()/tail() are used instead of fixed row numbers so the
# low end is picked correctly however many parks pass the filter;
# distinct() removes the overlap that would arise with fewer than 12 parks.
Sextreme <- distinct(
  bind_rows(
    head(S200, 8),
    tail(S200, 4)
  )
)

# Park names ordered by increasing PL; used as factor levels so plots
# facetted on Ballpark show panels in that order.
ballparks <- as.character(arrange(Sextreme, PL)$Ballpark)

# Recode Ballpark in d2 as a factor restricted to the extreme parks; every
# other park becomes NA because it is not among the levels.
d2$Ballpark <- factor(d2$Ballpark, levels = ballparks)
(Figure 6.8) This shows the distribution of the horizontal angle for each of these extreme parks.
# Density of the horizontal angle for each of the extreme parks, one panel
# per park in a four-column grid, with a blue reference line at 90. Only
# rows whose `bats` value is "R" or "L" and whose park appears in Sextreme
# are plotted.
d2 %>%
  filter(
    bats %in% c("R", "L"),
    Ballpark %in% Sextreme$Ballpark
  ) %>%
  ggplot(mapping = aes(x = 180 - Horiz_Angle)) +
  geom_density() +
  facet_wrap(~ Ballpark, ncol = 4) +
  geom_vline(xintercept = 90, colour = "blue") +
  labs(x = "Horizontal Angle", y = "Density")