For our analysis of the German election data, we prepared a script in the statistical programming environment R. The script was divided in two main parts: A preprocessing part in which we cleaned and structured the raw election data as we got it directly from the electoral management body, and the analysis part that transformed the election data into tables, models and maps.
All steps in the preprocessing were made to fit the data from the 2013 elections and were adjusted for the 2017 data when the first extrapolations were published. In this Notebook we show how we structured the original dataset in order to work with it and how we created the small multiples published in our project.
# install.packages("needs");library(needs)
needs(tidyverse, magrittr, rgdal, grid, gridExtra, mapproj, rgeos) # loading needed packages
# tidyverse, magrittr, stringr: to manipulate and structure data
# rgdal, mapproj, rgeos: to load geospatial data, apply map projections and calculate shape centroids
# grid, gridExtra: to arrange the maps in a grid for this Notebook
d <- read.csv("https://www.bundeswahlleiter.de/dam/jcr/72f186bb-aa56-47d3-b24c-6a46f5de22d0/btw17_kerg.csv", header=F, sep=";", skip=1, stringsAsFactors = F, na.strings="")
head(d[1:9])
## V1
## 1 #
## 2 # Wahl zum 19. Deutschen Bundestag (24. September 2017)
## 3 # Endgültige Ergebnisse der Erst- und Zweitstimmen sowie der Vorperiode nach Wahlkreisen
## 4 #
## 5 Nr
## 6 <NA>
## V2 V3 V4 V5 V6 V7 V8 V9
## 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 5 Gebiet gehört zu Wahlberechtigte <NA> <NA> <NA> Wähler <NA>
## 6 <NA> <NA> Erststimmen <NA> Zweitstimmen <NA> Erststimmen <NA>
# paste party names to second vote ("Zweitstimme")
for(i in 1:ncol(d)){
if(d[6,i] %in% "Zweitstimmen"){
d[6,i] <- paste(d[5,i-2], d[6,i])
}
}
colnames(d) <- d[6,] # set new column names
d <- d[-(1:7),] # remove extra rows
d <- d[d[3] != "99" & d[2] != "Bundesgebiet",] # remove totals of states and country
d <- cbind(d[1:2], d[grepl("Zweitstimmen$",colnames(d))]) # only keep second vote
colnames(d)[1:2] <- c("wkrnr","wahlkreisname") # rename first and second column
d <- d[!(is.na(d$wkrnr)),] # remove rows with missing values
d[-2] <- apply(d[-2],2,as.numeric) # save votes as numeric data
# add CDU and CSU votes to UNION
d %<>% group_by(wkrnr) %>% mutate(`UNION Zweitstimmen` = sum(`Christlich Demokratische Union Deutschlands Zweitstimmen`,`Christlich-Soziale Union in Bayern e.V. Zweitstimmen`, na.rm=T)) %>% select(-c(`Christlich Demokratische Union Deutschlands Zweitstimmen`, `Christlich-Soziale Union in Bayern e.V. Zweitstimmen`))
# Convert votes to percentages
d[grepl("Zweitstimmen$",colnames(d))][-(1:4)] <- round(d[grepl("Zweitstimmen$",colnames(d))][-(1:4)]/d$`Gültige Zweitstimmen`, 3)
d %<>% mutate(`Nichtwähler Zweitstimmen`=1-(`Wähler Zweitstimmen`/`Wahlberechtigte Zweitstimmen`), id=wkrnr-1, id=as.character(id)) %>% # add id to fit shape ids
gather(key="party", value="value", 7:48) %>% mutate(fill="#129B93") # convert to tidy data
head(d[1:6]) # take a look at the now tidy data
## # A tibble: 6 x 6
## # Groups: wkrnr [6]
## wkrnr wahlkreisname `Wahlberechtigte Zweitstimmen`
## <dbl> <chr> <dbl>
## 1 1 Flensburg – Schleswig 228471
## 2 2 Nordfriesland – Dithmarschen Nord 186568
## 3 3 Steinburg – Dithmarschen Süd 176636
## 4 4 Rendsburg-Eckernförde 200831
## 5 5 Kiel 204650
## 6 6 Plön – Neumünster 174937
## # ... with 3 more variables: `Wähler Zweitstimmen` <dbl>, `Ungültige
## # Zweitstimmen` <dbl>, `Gültige Zweitstimmen` <dbl>
Now that the main preprocessing is done, we can start constructing the maps.
# load state borders and shapes of the constituencies
# You can download them here: https://interaktiv.morgenpost.de/analyse-bundestagswahl-2017/data/btw17-shapes.zip
bund_shp <- readOGR("btw17-shapes/bundeslaender_small.shp", "bundeslaender_small", stringsAsFactors=FALSE, encoding="latin1") %>% broom::tidy()
## OGR data source with driver: ESRI Shapefile
## Source: "btw17-shapes/bundeslaender_small.shp", layer: "bundeslaender_small"
## with 16 features
## It has 4 fields
## Integer64 fields read as strings: WKR_NR
wahlkreise <- readOGR("btw17-shapes/wahlkreise_small.shp", "wahlkreise_small", stringsAsFactors=FALSE, encoding="latin1")
## OGR data source with driver: ESRI Shapefile
## Source: "btw17-shapes/wahlkreise_small.shp", layer: "wahlkreise_small"
## with 299 features
## It has 4 fields
## Integer64 fields read as strings: WKR_NR
krs_shp <- wahlkreise %>% broom::tidy() # broom to tidy shape data to make it work smoothly with the graphic package ggplot2
btw_plot <- merge(krs_shp, d, by="id", all.y=T) # merge shapes with vote data by id
head(btw_plot[1:6])
## id long lat order hole piece
## 1 0 9.676908 54.83785 1 FALSE 1
## 2 0 9.676908 54.83785 1 FALSE 1
## 3 0 9.676908 54.83785 1 FALSE 1
## 4 0 9.676908 54.83785 1 FALSE 1
## 5 0 9.676908 54.83785 1 FALSE 1
## 6 0 9.676908 54.83785 1 FALSE 1
# generate color gradient for each of the established parties and add it to the data depending on how high the vote was
gewinner_plot <- btw_plot
gewinner_plot[gewinner_plot$party %in% "Sozialdemokratische Partei Deutschlands Zweitstimmen",]$fill <-
colorRampPalette(c('#F0BAA5','#f40502'))(10)[as.numeric(cut(gewinner_plot[gewinner_plot$party %in% "Sozialdemokratische Partei Deutschlands Zweitstimmen",]$value, breaks = 10))]
gewinner_plot[gewinner_plot$party %in% "UNION Zweitstimmen",]$fill <-
colorRampPalette(c('#959595','#000000'))(10)[as.numeric(cut(gewinner_plot[gewinner_plot$party %in% "UNION Zweitstimmen",]$value, breaks=10))]
gewinner_plot[gewinner_plot$party %in% "Alternative für Deutschland Zweitstimmen",]$fill <-
colorRampPalette(c('#CEE9FF','#009de0'))(10)[as.numeric(cut(gewinner_plot[gewinner_plot$party %in% "Alternative für Deutschland Zweitstimmen",]$value, breaks=10))]
gewinner_plot[gewinner_plot$party %in% "DIE LINKE Zweitstimmen",]$fill <-
colorRampPalette(c('#E0A7C3','#8b1b62'))(10)[as.numeric(cut(gewinner_plot[gewinner_plot$party %in% "DIE LINKE Zweitstimmen",]$value, breaks=10))]
gewinner_plot[gewinner_plot$party %in% "Freie Demokratische Partei Zweitstimmen",]$fill <-
colorRampPalette(c('#FFFBD1','#feed01'))(10)[as.numeric(cut(gewinner_plot[gewinner_plot$party %in% "Freie Demokratische Partei Zweitstimmen",]$value, breaks=10))]
gewinner_plot[gewinner_plot$party %in% "BÃœNDNIS 90/DIE GRÃœNEN Zweitstimmen",]$fill <-
colorRampPalette(c('#A4D78F','#42a62a'))(10)[as.numeric(cut(gewinner_plot[gewinner_plot$party %in% "BÃœNDNIS 90/DIE GRÃœNEN Zweitstimmen",]$value, breaks=10))]
# filter constituencies for winning party
gewinner_plot %<>% group_by(id) %>% filter(party %in% party[value %in% max(value, na.rm=T)]) %>% arrange(id,order)
win <- ggplot(data=gewinner_plot, aes(x=long, y=lat, group=group)) +
geom_polygon(aes(fill=fill), show.legend = T) + # fill shapes with generated color
# add state borders
geom_polygon(data=bund_shp, aes(x=long, y=lat, group=group), fill=NA, color="white", size=0.4) +
scale_fill_identity() +
theme_void() + # remove axes
coord_map() # apply projection
win
# add colors, this time a fixed color per party
btw_plot$cols <- "#129B93"
btw_plot$cols[btw_plot$party %in% "Sozialdemokratische Partei Deutschlands Zweitstimmen"] <- "#DB4240"
btw_plot$cols[btw_plot$party %in% "BÃœNDNIS 90/DIE GRÃœNEN Zweitstimmen"] <- "#4BA345"
btw_plot$cols[btw_plot$party %in% "DIE LINKE Zweitstimmen"] <- "#96276E"
btw_plot$cols[btw_plot$party %in% "Freie Demokratische Partei Zweitstimmen"] <- "#F6BB00"
btw_plot$cols[btw_plot$party %in% "Alternative für Deutschland Zweitstimmen"] <- "#34A3D2"
btw_plot$cols[btw_plot$party %in% "UNION Zweitstimmen"] <- "#373737"
# loop through the parties and save the plot as an element of a list
plotlist <- list() # create empty list to fill with plots
for(i in 1:length(unique(btw_plot$party)))
local({
i <- i
# filter data to only contain current party
plotdata <- btw_plot %>% filter(party %in% unique(btw_plot$party)[i]) %>% arrange(id,order)
# save plot as list element
plotlist[[i]] <<- ggplot(data=plotdata, aes(x=long, y=lat, group=group)) +
geom_polygon(aes(fill=value), show.legend = F) +
# add state borders
geom_polygon(data=bund_shp, aes(x=long, y=lat, group=group), fill=NA, color="white", size=0.2) +
theme_void() +
# gradient from #F4F4F4 to party color depending on how high vote was, scaled individually
scale_fill_gradient(space = "Lab", na.value = "#F4F4F4", name=" ", low="#F4F4F4", high=unique(plotdata$cols), guide=FALSE) +
theme(line = element_blank(),
text = element_blank(),
title = element_blank())+
coord_map()
})
do.call("grid.arrange", c(plotlist, ncol=7)) # arrange the maps for the Notebook output
# to calculate gain or loss compared to vote from 2013
btw13 <- read.csv("https://www.bundeswahlleiter.de/dam/jcr/36efa904-5d4a-4159-a11e-2c8ecc1b0f77/btwkr17_umrechnung_btw13.csv", sep=";", header=F, stringsAsFactors = F, na.strings="", fileEncoding = "latin1")
# again, we have to tidy up the data sheet
colnames(btw13) <- paste(btw13[5,], btw13[6,])
btw13 <- btw13[-(1:6),] # remove extra rows
btw13 <- btw13[!(btw13[[3]] %in% c("Insgesamt", "Land")),] # remove totals of states and country
btw13 <- cbind(btw13[1:2], btw13[grepl("Zweitstimmen$",colnames(btw13))]) # only keep second vote
# change column names to match with the party names of the 2017 dataset
colnames(btw13)[c(1:2,6:9,26)] <- c("wkrnr","wahlkreisname","Sozialdemokratische Partei Deutschlands Zweitstimmen","Freie Demokratische Partei Zweitstimmen","DIE LINKE Zweitstimmen","BÜNDNIS 90/DIE GRÜNEN Zweitstimmen","Alternative für Deutschland Zweitstimmen")
btw13[-2] <- apply(btw13[-2],2,as.numeric) # save votes as numeric data
# add CDU and CSU votes to UNION
btw13 %<>% group_by(wkrnr) %>% mutate(`UNION Zweitstimmen` = sum(`CDU Zweitstimmen`,`CSU Zweitstimmen`, na.rm=T)) %>% select(-c(`CDU Zweitstimmen`, `CSU Zweitstimmen`))
# Convert votes to percentages
btw13[grepl("Zweitstimmen$",colnames(btw13))][-(1:2)] <- round(btw13[grepl("Zweitstimmen$",colnames(btw13))][-(1:2)]/btw13$`Gültige Zweitstimmen`, 3)
# calculate and add centroids of the constuencies
btw13 <- cbind(as.data.frame(btw13),as.data.frame(gCentroid(wahlkreise, byid=TRUE))) %>% gather(key=party, value=value, 5:33)
head(btw13[1:6]) # take a look at the now tidy data
## wkrnr wahlkreisname Ungültige Zweitstimmen Gültige Zweitstimmen
## 1 1 SH 2113 160636
## 2 2 SH 1483 130044
## 3 3 SH 1451 124958
## 4 4 SH 1616 147967
## 5 5 SH 1483 144969
## 6 6 SH 1520 125573
## x y
## 1 9.509453 54.63156
## 2 8.944818 54.52165
## 3 9.405561 53.96564
## 4 9.778875 54.29168
## 5 10.132485 54.33390
## 6 10.315654 54.20776
# we merge the 2017 data with the 2013 data, filter for our six parties of interest and calculate the gain/loss. Finally we merge the result with the shapefile
btw_verluste <- d %>% merge(.,btw13[c(1,5:8)], by=c("wkrnr","party")) %>%
filter(party %in% c("Sozialdemokratische Partei Deutschlands Zweitstimmen","BÜNDNIS 90/DIE GRÜNEN Zweitstimmen","DIE LINKE Zweitstimmen","Freie Demokratische Partei Zweitstimmen","Alternative für Deutschland Zweitstimmen","UNION Zweitstimmen")) %>%
group_by(id, party) %>% mutate(verlust=value.x-value.y) %>% select(c("id","party","verlust","x","y")) %>%
merge(krs_shp, ., by="id", all.y=T) %>% arrange(id, order)
# add colors for gain...
btw_verluste$cols[btw_verluste$party %in% "Sozialdemokratische Partei Deutschlands Zweitstimmen"] <- "#DB4240"
btw_verluste$cols[btw_verluste$party %in% "BÃœNDNIS 90/DIE GRÃœNEN Zweitstimmen"] <- "#4BA345"
btw_verluste$cols[btw_verluste$party %in% "DIE LINKE Zweitstimmen"] <- "#96276E"
btw_verluste$cols[btw_verluste$party %in% "Freie Demokratische Partei Zweitstimmen"] <- "#F6BB00"
btw_verluste$cols[btw_verluste$party %in% "Alternative für Deutschland Zweitstimmen"] <- "#34A3D2"
btw_verluste$cols[btw_verluste$party %in% "UNION Zweitstimmen"] <- "#373737"
# and for loss
btw_verluste$cols_d[btw_verluste$party %in% "Sozialdemokratische Partei Deutschlands Zweitstimmen"] <- "#AB8F8F"
btw_verluste$cols_d[btw_verluste$party %in% "BÃœNDNIS 90/DIE GRÃœNEN Zweitstimmen"] <- "#A1B7A0"
btw_verluste$cols_d[btw_verluste$party %in% "DIE LINKE Zweitstimmen"] <- "#A997A3"
btw_verluste$cols_d[btw_verluste$party %in% "Freie Demokratische Partei Zweitstimmen"] <- "#C3B8A0"
btw_verluste$cols_d[btw_verluste$party %in% "Alternative für Deutschland Zweitstimmen"] <- "#93ABB5"
btw_verluste$cols_d[btw_verluste$party %in% "UNION Zweitstimmen"] <- "#999999"
# loop through the parties and save each plot as an element of a list
plotlist <- list()
for(i in 1:length(unique(btw_verluste$party)))
local({
i <- i
plotdata <- btw_verluste %>% filter(party %in% unique(btw_verluste$party)[i]) %>%
arrange(id,order)
# calculate the three coordinates of each triangle
verlust1 <- btw_verluste %>% filter(party %in% unique(btw_verluste$party)[i]) %>%
mutate(y=y+(verlust*10), t_id=1) # top of the triangle, y1
verlust2 <- btw_verluste %>% filter(party %in% unique(btw_verluste$party)[i]) %>%
mutate(x=x-0.1, t_id=2) # left base, x2
verlust3 <- btw_verluste %>% filter(party %in% unique(btw_verluste$party)[i]) %>%
mutate(x=x+0.1, t_id=3) # right base, x3
# factorize gain and loss compared to 2013
verlust <- rbind(verlust1,verlust2,verlust3) %>% arrange(id,t_id) %>%
mutate(td=ifelse(verlust>=0, "gain", "loss")) %>%
mutate(td=as.factor(td))
# save plot as list element
plotlist[[i]] <<- ggplot(data=plotdata, aes(x=long, y=lat, group=group)) +
geom_polygon(fill=NA, color=NA) +
geom_polygon(data=bund_shp, aes(x=long, y=lat, group=group), fill="#F0F0F0", color="#CCCCCC", size=0.5) +
geom_polygon(data=verlust, mapping=aes(x=x, y=y, group=id, color = NA, fill = factor(td))) +
# take different color for gain and loss
scale_fill_manual(values = c("loss"=alpha(unique(plotdata$cols_d), 0.7),
"gain"=alpha(unique(plotdata$cols),0.8))) +
scale_color_manual(values = c("loss"=alpha(unique(plotdata$cols_d), 0.7),
"gain"=alpha(unique(plotdata$cols),0.8))) +
guides(size=F, shape=F, fill=F, color=F, alpha=F) +
theme_void() +
theme(line = element_blank(),
text = element_blank(),
title = element_blank())+
coord_map()
})
do.call("grid.arrange", c(plotlist, ncol=3)) # plot the list in a grid