Chapter 3 Data transformation
3.2 health data
Firstly, we read the health data from cms government website.
read_health_data <- function() {
url = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/NationalHealthExpendData/Downloads/resident-state-estimates.zip'
temp <- tempfile()
temp2 <- tempfile()
# download the zipfile
download.file(url, temp)
# unzip the file
unzip(zipfile = temp, exdir = temp2)
# read_csv, only select the state and 2014 column
health_per_capita <- read_csv(file.path(temp2, "US_PER_CAPITA14.CSV")) %>%
filter(`Item` == 'Personal Health Care ($)') %>%
select('State_Name', 'Y2014') %>%
na.omit()
names(health_per_capita) = c("state", "health_cost_per_captia")
return(health_per_capita)
}
health_data <- read_health_data()
head(health_data)
## # A tibble: 6 x 2
## state health_cost_per_captia
## <chr> <dbl>
## 1 Alabama 7281
## 2 Alaska 11064
## 3 Arizona 6452
## 4 Arkansas 7408
## 5 California 7549
## 6 Colorado 6804
3.3 covid data
Secondly, we read the covid_19 data before November 30, 2020 and change the column name to make it more readable.
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
confirmed_df = read.csv(paste0(url, "time_series_covid19_confirmed_US.csv"))
# colnames(confirmed_df) = gsub("X","", colnames(confirmed_df))
death_df = read.csv(paste0(url, "time_series_covid19_deaths_US.csv"))
# colnames(death_df) = gsub("X","", colnames(death_df))
# function for transform covid time series data
transform_confirmed <- function(df) {
info_col = df[,c("UID","Admin2","Province_State","Lat","Long_")]
Nov_30 = df["X11.30.20"]
df = cbind(info_col, Nov_30)
# rename columns
ColName = c("UID","county","state","lat", "long", paste0('confirmed', "_Nov_30"))
for (i in 1:6){
names(df)[i] = paste(ColName[i])
}
return(df)
}
# population column is in the death time series data, use another method
transform_death <- function(df) {
info_col = df[,c("UID","Population")]
Nov_30 = df["X11.30.20"]
df = cbind(info_col, Nov_30)
# rename columns
ColName = c("UID","population", paste0('death', "_Nov_30"))
for (i in 1:3){
names(df)[i] = paste(ColName[i])
}
return(df)
}
# transform the two dataframe for Nov 30th only
confirmed_df_Nov_30 = transform_confirmed(confirmed_df)
death_df_Nov_30 = transform_death(death_df)
# merge confirmed and death by UID
covid_merged = merge(confirmed_df_Nov_30, death_df_Nov_30, by = "UID") # check:3340 rows, 8 columns
# add confirm_rate column
covid_merged$confirmed_rate = covid_merged$confirmed_Nov_30 / covid_merged$population
# get death_rate: death / confirmed
#covid_merged$death_Nov_30 = as.numeric(covid_merged$death_Nov_30)
covid_merged$death_rate = ifelse(covid_merged$confirmed_Nov_30>0,
covid_merged$death_Nov_30/covid_merged$confirmed_Nov_30,
0)
# if population == 0 remove this row
covid_merged = covid_merged[covid_merged['population']!=0,]
head(covid_merged)
## UID county state lat long
## 1 16 American Samoa -14.27100 -170.13200
## 2 316 Guam 13.44430 144.79370
## 3 580 Northern Mariana Islands 15.09790 145.67390
## 4 850 Virgin Islands 18.33580 -64.89630
## 5 63072001 Adjuntas Puerto Rico 18.18012 -66.75437
## 6 63072003 Aguada Puerto Rico 18.36025 -67.17513
## confirmed_Nov_30 population death_Nov_30 confirmed_rate death_rate
## 1 0 55641 0 0.000000000 0.00000000
## 2 6852 164229 112 0.041722229 0.01634559
## 3 106 55144 2 0.001922240 0.01886792
## 4 1544 107268 23 0.014393855 0.01489637
## 5 159 19483 0 0.008160961 0.00000000
## 6 428 41959 0 0.010200434 0.00000000
We want to find the relationship between covid_19 and election through time. So, we generate new dataframe confirmed_d3 and confirmed_TS.
transform_confirmed_d3 <- function(df) {
info_col = df[,c("UID","Admin2","Province_State","Lat","Long_")]
dates = df[,c("X3.30.20","X5.30.20","X7.30.20","X9.30.20","X11.30.20")]
df = cbind(info_col, dates)
# rename columns
ColName = c("UID","county","state","lat", "long")
for (i in 1:5){
names(df)[i] = paste(ColName[i])
}
return(df)
}
confirmed_d3 = transform_confirmed_d3(confirmed_df)
subdeath = subset(death_df,select = c("UID","Population"))
confirmed_d3 = merge(confirmed_d3,subdeath, by="UID")
# cr means confirmed rate
confirmed_d3$cr3 = confirmed_d3$X3.30.20/confirmed_d3$Population
confirmed_d3$cr5 = confirmed_d3$X5.30.20/confirmed_d3$Population
confirmed_d3$cr7 = confirmed_d3$X7.30.20/confirmed_d3$Population
confirmed_d3$cr9 = confirmed_d3$X9.30.20/confirmed_d3$Population
confirmed_d3$cr11 = confirmed_d3$X11.30.20/confirmed_d3$Population
head(confirmed_d3)
## UID county state lat long X3.30.20
## 1 16 American Samoa -14.27100 -170.13200 0
## 2 316 Guam 13.44430 144.79370 58
## 3 580 Northern Mariana Islands 15.09790 145.67390 0
## 4 850 Virgin Islands 18.33580 -64.89630 30
## 5 63072001 Adjuntas Puerto Rico 18.18012 -66.75437 0
## 6 63072003 Aguada Puerto Rico 18.36025 -67.17513 0
## X5.30.20 X7.30.20 X9.30.20 X11.30.20 Population cr3 cr5
## 1 0 0 0 0 55641 0.0000000000 0.0000000000
## 2 172 356 2488 6852 164229 0.0003531654 0.0010473181
## 3 22 42 70 106 55144 0.0000000000 0.0003989555
## 4 69 385 1318 1544 107268 0.0002796733 0.0006432487
## 5 7 29 77 159 19483 0.0000000000 0.0003592876
## 6 10 36 172 428 41959 0.0000000000 0.0002383279
## cr7 cr9 cr11
## 1 0.0000000000 0.000000000 0.000000000
## 2 0.0021677049 0.015149578 0.041722229
## 3 0.0007616422 0.001269404 0.001922240
## 4 0.0035891412 0.012286982 0.014393855
## 5 0.0014884771 0.003952163 0.008160961
## 6 0.0008579804 0.004099240 0.010200434
confirmed_TS <- confirmed_df %>% select(Combined_Key,UID,Province_State, c(`X1.22.20`:`X11.30.20`))
death_TS <- death_df %>% select(Combined_Key, UID,Province_State, Population, c(`X1.22.20`:`X11.30.20`))
confirmed_TS_merge = merge(confirmed_df, death_df%>%select(UID, Population), by = "UID")
confirmed_TS_merge = confirmed_TS_merge[confirmed_TS_merge['Population'] != 0, ]
3.4 election data
The last dataset we want to consider is the 2020 election data in USA.
url_election = "https://raw.githubusercontent.com/JessieYee/EDAV_Covid19_Research/main/election2020.csv"
election = read.csv(url_election)
#election$Combined_Key = paste0(election$County, ", ", election$State, ", US")
election[] <- lapply(election, gsub, pattern=',', replacement='')
election["state"]<-lapply(election["state"], gsub, pattern='-', replacement=' ')
election$Biden = as.numeric(election$Biden)
election$Trump = as.numeric(election$Trump)
election$Biden_support_rate = election$Biden / (election$Biden+election$Trump)
head(election)
## state county Biden Trump Biden_support_rate
## 1 Alabama Autauga 7503 19838 0.27442303
## 2 Alabama Baldwin 24578 83544 0.22731729
## 3 Alabama Barbour 4816 5622 0.46139107
## 4 Alabama Bibb 1986 7525 0.20881085
## 5 Alabama Blount 2640 24711 0.09652298
## 6 Alabama Bullock 3446 1146 0.75043554
We generate new variable: Biden_support_rate to calculate the support rate of Biden. For Biden_support_rate>0.5, we can conclude that more than half of people in this state support Biden. For Biden_support_rate<0.5, we can conclude that more than half of people in this state support Trump.
3.5 merge data
- left join the covid_19 data and election
covid_merge_all <- merge(covid_merged, election, by=c("state","county"),all.x = TRUE,all.y = FALSE)
covid_state = covid_merge_all %>%
group_by(`state`) %>%
summarize(`confirmed_Nov_30` = sum(confirmed_Nov_30),
`death_Nov_30` = sum(death_Nov_30),
`population` = sum(population),
`confirmed_rate` =`confirmed_Nov_30`/ `population`,
`death_rate` = ifelse('confirmed_Nov_30'>0,
`death_Nov_30`/`confirmed_Nov_30`,
0),
`Biden` = sum(Biden,na.rm=TRUE),
`Trump` = sum(Trump,na.rm=TRUE),
`Biden Support Rate` = `Biden`/(`Biden` + `Trump`),
)
covid_state <- covid_state %>%
mutate(`Support Which Party` = case_when(covid_state$`Biden Support Rate` > 0.5 ~ "Democratic",
covid_state$`Biden Support Rate` < 0.5 ~ "Republican"
))
covid_state = merge(covid_state, health_data, by = "state")
head(covid_state)
## state confirmed_Nov_30 death_Nov_30 population confirmed_rate
## 1 Alabama 249524 3578 4903185 0.05089019
## 2 Alaska 32418 121 731545 0.04431443
## 3 Arizona 326817 6639 7278717 0.04490036
## 4 Arkansas 154865 2502 3017804 0.05131712
## 5 California 1230264 19173 39512223 0.03113629
## 6 Colorado 232878 3037 5758736 0.04043908
## death_rate Biden Trump Biden Support Rate Support Which Party
## 1 0.014339302 849648 1441168 0.3708932 Republican
## 2 0.003732494 0 0 NaN <NA>
## 3 0.020314121 1672143 1661686 0.5015683 Democratic
## 4 0.016156007 420328 757405 0.3568958 Republican
## 5 0.015584460 11109764 6005961 0.6490969 Democratic
## 6 0.013041163 1804352 1364607 0.5693832 Democratic
## health_cost_per_captia
## 1 7281
## 2 11064
## 3 6452
## 4 7408
## 5 7549
## 6 6804
We separate the votes to two groups: support Democratic Party and support Republican Party by checking the Biden Support Rate
.
- Merge election data with confirmed_d3 dataset
confirmed_d3_all <- merge(confirmed_d3, election, by=c("state","county"),all.x = TRUE,all.y = FALSE)
confirmed_d3 = confirmed_d3_all %>%
group_by(`state`) %>%
summarize(`confirmed_Nov_30` = sum(X11.30.20),
`confirmed_Sep_30` = sum(X9.30.20),
`confirmed_July_30` = sum(X7.30.20),
`confirmed_May_30` = sum(X5.30.20),
`confirmed_Mar_30` = sum(X3.30.20),
`population` = sum(Population),
`Nov_rate` =`confirmed_Nov_30`/ `population`,
`Sep_rate` =`confirmed_Sep_30`/ `population`,
`July_rate` =`confirmed_July_30`/ `population`,
`May_rate` =`confirmed_May_30`/ `population`,
`March_rate` =`confirmed_Mar_30`/ `population`,
`Biden` = sum(Biden,na.rm=TRUE),
`Trump` = sum(Trump,na.rm=TRUE),
`Biden Support Rate` = `Biden`/(`Biden` + `Trump`),
)
confirmed_d3 <- confirmed_d3 %>%
mutate(`Support Which Party` = case_when(confirmed_d3$`Biden Support Rate` > 0.5 ~ "#407abb",
confirmed_d3$`Biden Support Rate` < 0.5 ~ "#c03735"
))
confirmed_d3 = merge(confirmed_d3, health_data, by = "state")
confirmed_d3$state = state.abb[match(confirmed_d3$state,state.name)]
head(confirmed_d3)
## state confirmed_Nov_30 confirmed_Sep_30 confirmed_July_30 confirmed_May_30
## 1 AL 249524 154772 86664 17689
## 2 AK 32576 8845 3605 455
## 3 AZ 326817 218507 170798 19258
## 4 AR 157359 83697 41759 7013
## 5 CA 1230264 819115 492934 109895
## 6 CO 232905 70536 46204 26098
## confirmed_Mar_30 population Nov_rate Sep_rate July_rate May_rate
## 1 1001 4903185 0.05089019 0.03156560 0.017675042 0.0036076550
## 2 119 731545 0.04453041 0.01209085 0.004927927 0.0006219713
## 3 1157 7278717 0.04490036 0.03001999 0.023465399 0.0026457960
## 4 473 3017804 0.05214355 0.02773441 0.013837545 0.0023238752
## 5 7155 39512223 0.03113629 0.02073067 0.012475481 0.0027812912
## 6 2627 5758736 0.04044377 0.01224852 0.008023288 0.0045318973
## March_rate Biden Trump Biden Support Rate Support Which Party
## 1 0.0002041530 849648 1441168 0.3708932 #c03735
## 2 0.0001626694 0 0 NaN <NA>
## 3 0.0001589566 1672143 1661686 0.5015683 #407abb
## 4 0.0001567365 420328 757405 0.3568958 #c03735
## 5 0.0001810832 11109764 6005961 0.6490969 #407abb
## 6 0.0004561765 1804352 1364607 0.5693832 #407abb
## health_cost_per_captia
## 1 7281
## 2 11064
## 3 6452
## 4 7408
## 5 7549
## 6 6804
- Merge new data frame: death_per_100 with covid_state data
death_per_100 = covid_merge_all %>%
group_by(`state`) %>%
summarize(
`death_Nov_30` = sum(death_Nov_30),
`population` = sum(population),
`death_per_100` = `death_Nov_30`/`population`*100) %>%
select(`state`, `death_per_100`)
covid_state = merge(covid_state, death_per_100, by='state')
covid_state$death_per_1M = covid_state$death_per_100 * 10000
head(covid_state)
## state confirmed_Nov_30 death_Nov_30 population confirmed_rate
## 1 Alabama 249524 3578 4903185 0.05089019
## 2 Alaska 32418 121 731545 0.04431443
## 3 Arizona 326817 6639 7278717 0.04490036
## 4 Arkansas 154865 2502 3017804 0.05131712
## 5 California 1230264 19173 39512223 0.03113629
## 6 Colorado 232878 3037 5758736 0.04043908
## death_rate Biden Trump Biden Support Rate Support Which Party
## 1 0.014339302 849648 1441168 0.3708932 Republican
## 2 0.003732494 0 0 NaN <NA>
## 3 0.020314121 1672143 1661686 0.5015683 Democratic
## 4 0.016156007 420328 757405 0.3568958 Republican
## 5 0.015584460 11109764 6005961 0.6490969 Democratic
## 6 0.013041163 1804352 1364607 0.5693832 Democratic
## health_cost_per_captia death_per_100 death_per_1M
## 1 7281 0.07297298 729.7298
## 2 11064 0.01654034 165.4034
## 3 6452 0.09121113 912.1113
## 4 7408 0.08290797 829.0797
## 5 7549 0.04852423 485.2423
## 6 6804 0.05273727 527.3727
We create a new column: death per 100 by calculating the death case divided by population.