Chapter 3 Data transformation

3.1 Library Package

library(dplyr)
library(readr)

3.2 health data

Firstly, we read the health data from cms government website.

read_health_data <- function() {
  url = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/NationalHealthExpendData/Downloads/resident-state-estimates.zip'
  temp <- tempfile()
  temp2 <- tempfile()
  # download the zipfile 
  download.file(url, temp)
  # unzip the file 
  unzip(zipfile = temp, exdir = temp2)
  # read_csv, only select the state and 2014 column 
  health_per_capita <- read_csv(file.path(temp2, "US_PER_CAPITA14.CSV")) %>% 
    filter(`Item` == 'Personal Health Care ($)') %>% 
    select('State_Name', 'Y2014') %>%
    na.omit()
  names(health_per_capita) = c("state", "health_cost_per_captia")
  return(health_per_capita)
}

health_data <- read_health_data()
head(health_data)
## # A tibble: 6 x 2
##   state      health_cost_per_captia
##   <chr>                       <dbl>
## 1 Alabama                      7281
## 2 Alaska                      11064
## 3 Arizona                      6452
## 4 Arkansas                     7408
## 5 California                   7549
## 6 Colorado                     6804

3.3 covid data

Secondly, we read the covid_19 data before November 30, 2020 and change the column name to make it more readable.

url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"

confirmed_df = read.csv(paste0(url, "time_series_covid19_confirmed_US.csv"))
# colnames(confirmed_df) = gsub("X","", colnames(confirmed_df))
death_df = read.csv(paste0(url, "time_series_covid19_deaths_US.csv"))
# colnames(death_df) = gsub("X","", colnames(death_df))

# function for transform covid time series data
transform_confirmed <- function(df) {
  info_col = df[,c("UID","Admin2","Province_State","Lat","Long_")]
  Nov_30 = df["X11.30.20"]
  df = cbind(info_col, Nov_30)
  # rename columns 
  ColName = c("UID","county","state","lat", "long", paste0('confirmed', "_Nov_30"))
  for (i in 1:6){
    names(df)[i] = paste(ColName[i])
  }
  return(df)
}

# population column is in the death time series data, use another method 
transform_death <- function(df) {
  info_col = df[,c("UID","Population")]
  Nov_30 = df["X11.30.20"]
  df = cbind(info_col, Nov_30)
  # rename columns 
  ColName = c("UID","population", paste0('death', "_Nov_30"))
  for (i in 1:3){
    names(df)[i] = paste(ColName[i])
  }
  return(df)
}

# transform the two dataframe for Nov 30th only
confirmed_df_Nov_30 = transform_confirmed(confirmed_df)
death_df_Nov_30 = transform_death(death_df)


# merge confirmed and death by UID
covid_merged = merge(confirmed_df_Nov_30, death_df_Nov_30, by = "UID") # check:3340 rows, 8 columns

# add confirm_rate column
covid_merged$confirmed_rate = covid_merged$confirmed_Nov_30 / covid_merged$population 
# get death_rate: death / confirmed 
#covid_merged$death_Nov_30 = as.numeric(covid_merged$death_Nov_30)
covid_merged$death_rate = ifelse(covid_merged$confirmed_Nov_30>0, 
                                 covid_merged$death_Nov_30/covid_merged$confirmed_Nov_30,
                                 0)

# if population == 0 remove this row
covid_merged = covid_merged[covid_merged['population']!=0,]
head(covid_merged)
##        UID   county                    state       lat       long
## 1       16                    American Samoa -14.27100 -170.13200
## 2      316                              Guam  13.44430  144.79370
## 3      580          Northern Mariana Islands  15.09790  145.67390
## 4      850                    Virgin Islands  18.33580  -64.89630
## 5 63072001 Adjuntas              Puerto Rico  18.18012  -66.75437
## 6 63072003   Aguada              Puerto Rico  18.36025  -67.17513
##   confirmed_Nov_30 population death_Nov_30 confirmed_rate death_rate
## 1                0      55641            0    0.000000000 0.00000000
## 2             6852     164229          112    0.041722229 0.01634559
## 3              106      55144            2    0.001922240 0.01886792
## 4             1544     107268           23    0.014393855 0.01489637
## 5              159      19483            0    0.008160961 0.00000000
## 6              428      41959            0    0.010200434 0.00000000

We want to find the relationship between covid_19 and election through time. So, we generate new dataframe confirmed_d3 and confirmed_TS.

transform_confirmed_d3 <- function(df) {
  info_col = df[,c("UID","Admin2","Province_State","Lat","Long_")]
  dates = df[,c("X3.30.20","X5.30.20","X7.30.20","X9.30.20","X11.30.20")]
  df = cbind(info_col, dates)
  # rename columns 
  ColName = c("UID","county","state","lat", "long")
  for (i in 1:5){
    names(df)[i] = paste(ColName[i])
  }
  return(df)
}

confirmed_d3 = transform_confirmed_d3(confirmed_df)
subdeath = subset(death_df,select = c("UID","Population"))
confirmed_d3 = merge(confirmed_d3,subdeath, by="UID")

# cr means confirmed rate 
confirmed_d3$cr3 = confirmed_d3$X3.30.20/confirmed_d3$Population
confirmed_d3$cr5 = confirmed_d3$X5.30.20/confirmed_d3$Population
confirmed_d3$cr7 = confirmed_d3$X7.30.20/confirmed_d3$Population
confirmed_d3$cr9 = confirmed_d3$X9.30.20/confirmed_d3$Population
confirmed_d3$cr11 = confirmed_d3$X11.30.20/confirmed_d3$Population
head(confirmed_d3)
##        UID   county                    state       lat       long X3.30.20
## 1       16                    American Samoa -14.27100 -170.13200        0
## 2      316                              Guam  13.44430  144.79370       58
## 3      580          Northern Mariana Islands  15.09790  145.67390        0
## 4      850                    Virgin Islands  18.33580  -64.89630       30
## 5 63072001 Adjuntas              Puerto Rico  18.18012  -66.75437        0
## 6 63072003   Aguada              Puerto Rico  18.36025  -67.17513        0
##   X5.30.20 X7.30.20 X9.30.20 X11.30.20 Population          cr3          cr5
## 1        0        0        0         0      55641 0.0000000000 0.0000000000
## 2      172      356     2488      6852     164229 0.0003531654 0.0010473181
## 3       22       42       70       106      55144 0.0000000000 0.0003989555
## 4       69      385     1318      1544     107268 0.0002796733 0.0006432487
## 5        7       29       77       159      19483 0.0000000000 0.0003592876
## 6       10       36      172       428      41959 0.0000000000 0.0002383279
##            cr7         cr9        cr11
## 1 0.0000000000 0.000000000 0.000000000
## 2 0.0021677049 0.015149578 0.041722229
## 3 0.0007616422 0.001269404 0.001922240
## 4 0.0035891412 0.012286982 0.014393855
## 5 0.0014884771 0.003952163 0.008160961
## 6 0.0008579804 0.004099240 0.010200434
confirmed_TS <- confirmed_df %>% select(Combined_Key,UID,Province_State, c(`X1.22.20`:`X11.30.20`)) 
death_TS <- death_df %>% select(Combined_Key, UID,Province_State, Population, c(`X1.22.20`:`X11.30.20`))
confirmed_TS_merge = merge(confirmed_df, death_df%>%select(UID, Population), by = "UID")
confirmed_TS_merge = confirmed_TS_merge[confirmed_TS_merge['Population'] != 0, ]

3.4 election data

The last dataset we want to consider is the 2020 election data in USA.

url_election = "https://raw.githubusercontent.com/JessieYee/EDAV_Covid19_Research/main/election2020.csv"

election = read.csv(url_election)
#election$Combined_Key = paste0(election$County, ", ", election$State, ", US")
election[] <- lapply(election, gsub, pattern=',', replacement='')
election["state"]<-lapply(election["state"], gsub, pattern='-', replacement=' ')
election$Biden = as.numeric(election$Biden)
election$Trump = as.numeric(election$Trump)
election$Biden_support_rate = election$Biden / (election$Biden+election$Trump)
head(election)
##     state  county Biden Trump Biden_support_rate
## 1 Alabama Autauga  7503 19838         0.27442303
## 2 Alabama Baldwin 24578 83544         0.22731729
## 3 Alabama Barbour  4816  5622         0.46139107
## 4 Alabama    Bibb  1986  7525         0.20881085
## 5 Alabama  Blount  2640 24711         0.09652298
## 6 Alabama Bullock  3446  1146         0.75043554

We generate new variable: Biden_support_rate to calculate the support rate of Biden. For Biden_support_rate>0.5, we can conclude that more than half of people in this state support Biden. For Biden_support_rate<0.5, we can conclude that more than half of people in this state support Trump.

3.5 merge data

  1. left join the covid_19 data and election
covid_merge_all <- merge(covid_merged, election, by=c("state","county"),all.x = TRUE,all.y = FALSE)

covid_state = covid_merge_all %>% 
  group_by(`state`) %>% 
  summarize(`confirmed_Nov_30` = sum(confirmed_Nov_30),
            `death_Nov_30` = sum(death_Nov_30),
            `population` = sum(population),
            `confirmed_rate` =`confirmed_Nov_30`/ `population`,
            `death_rate` = ifelse('confirmed_Nov_30'>0, 
                                  `death_Nov_30`/`confirmed_Nov_30`,
                                  0),
            `Biden` = sum(Biden,na.rm=TRUE),
            `Trump` = sum(Trump,na.rm=TRUE),
            `Biden Support Rate` = `Biden`/(`Biden` + `Trump`),
  )

covid_state <- covid_state %>% 
  mutate(`Support Which Party` = case_when(covid_state$`Biden Support Rate` > 0.5 ~ "Democratic", 
                                           covid_state$`Biden Support Rate` < 0.5 ~ "Republican"
  ))
covid_state = merge(covid_state, health_data, by = "state")
head(covid_state)
##        state confirmed_Nov_30 death_Nov_30 population confirmed_rate
## 1    Alabama           249524         3578    4903185     0.05089019
## 2     Alaska            32418          121     731545     0.04431443
## 3    Arizona           326817         6639    7278717     0.04490036
## 4   Arkansas           154865         2502    3017804     0.05131712
## 5 California          1230264        19173   39512223     0.03113629
## 6   Colorado           232878         3037    5758736     0.04043908
##    death_rate    Biden   Trump Biden Support Rate Support Which Party
## 1 0.014339302   849648 1441168          0.3708932          Republican
## 2 0.003732494        0       0                NaN                <NA>
## 3 0.020314121  1672143 1661686          0.5015683          Democratic
## 4 0.016156007   420328  757405          0.3568958          Republican
## 5 0.015584460 11109764 6005961          0.6490969          Democratic
## 6 0.013041163  1804352 1364607          0.5693832          Democratic
##   health_cost_per_captia
## 1                   7281
## 2                  11064
## 3                   6452
## 4                   7408
## 5                   7549
## 6                   6804

We separate the votes to two groups: support Democratic Party and support Republican Party by checking the Biden Support Rate.

  1. Merge election data with confirmed_d3 dataset
confirmed_d3_all <- merge(confirmed_d3, election, by=c("state","county"),all.x = TRUE,all.y = FALSE)

confirmed_d3 = confirmed_d3_all %>% 
  group_by(`state`) %>% 
  summarize(`confirmed_Nov_30` = sum(X11.30.20),
            `confirmed_Sep_30` = sum(X9.30.20),
            `confirmed_July_30` = sum(X7.30.20),
            `confirmed_May_30` = sum(X5.30.20),
            `confirmed_Mar_30` = sum(X3.30.20),
            `population` = sum(Population),
            `Nov_rate` =`confirmed_Nov_30`/ `population`,
            `Sep_rate` =`confirmed_Sep_30`/ `population`,
            `July_rate` =`confirmed_July_30`/ `population`,
            `May_rate` =`confirmed_May_30`/ `population`,
            `March_rate` =`confirmed_Mar_30`/ `population`,
            `Biden` = sum(Biden,na.rm=TRUE),
            `Trump` = sum(Trump,na.rm=TRUE),
            `Biden Support Rate` = `Biden`/(`Biden` + `Trump`),
  )

confirmed_d3 <- confirmed_d3 %>% 
  mutate(`Support Which Party` = case_when(confirmed_d3$`Biden Support Rate` > 0.5 ~ "#407abb", 
                                           confirmed_d3$`Biden Support Rate` < 0.5 ~ "#c03735"
  ))
confirmed_d3 = merge(confirmed_d3, health_data, by = "state")

confirmed_d3$state = state.abb[match(confirmed_d3$state,state.name)]
head(confirmed_d3)
##   state confirmed_Nov_30 confirmed_Sep_30 confirmed_July_30 confirmed_May_30
## 1    AL           249524           154772             86664            17689
## 2    AK            32576             8845              3605              455
## 3    AZ           326817           218507            170798            19258
## 4    AR           157359            83697             41759             7013
## 5    CA          1230264           819115            492934           109895
## 6    CO           232905            70536             46204            26098
##   confirmed_Mar_30 population   Nov_rate   Sep_rate   July_rate     May_rate
## 1             1001    4903185 0.05089019 0.03156560 0.017675042 0.0036076550
## 2              119     731545 0.04453041 0.01209085 0.004927927 0.0006219713
## 3             1157    7278717 0.04490036 0.03001999 0.023465399 0.0026457960
## 4              473    3017804 0.05214355 0.02773441 0.013837545 0.0023238752
## 5             7155   39512223 0.03113629 0.02073067 0.012475481 0.0027812912
## 6             2627    5758736 0.04044377 0.01224852 0.008023288 0.0045318973
##     March_rate    Biden   Trump Biden Support Rate Support Which Party
## 1 0.0002041530   849648 1441168          0.3708932             #c03735
## 2 0.0001626694        0       0                NaN                <NA>
## 3 0.0001589566  1672143 1661686          0.5015683             #407abb
## 4 0.0001567365   420328  757405          0.3568958             #c03735
## 5 0.0001810832 11109764 6005961          0.6490969             #407abb
## 6 0.0004561765  1804352 1364607          0.5693832             #407abb
##   health_cost_per_captia
## 1                   7281
## 2                  11064
## 3                   6452
## 4                   7408
## 5                   7549
## 6                   6804
  1. Merge new data frame: death_per_100 with covid_state data
death_per_100 = covid_merge_all %>% 
  group_by(`state`) %>% 
  summarize(
    `death_Nov_30` = sum(death_Nov_30),
    `population` = sum(population),
    `death_per_100` = `death_Nov_30`/`population`*100) %>%
  select(`state`, `death_per_100`)
covid_state = merge(covid_state, death_per_100, by='state')
covid_state$death_per_1M = covid_state$death_per_100 * 10000
head(covid_state)
##        state confirmed_Nov_30 death_Nov_30 population confirmed_rate
## 1    Alabama           249524         3578    4903185     0.05089019
## 2     Alaska            32418          121     731545     0.04431443
## 3    Arizona           326817         6639    7278717     0.04490036
## 4   Arkansas           154865         2502    3017804     0.05131712
## 5 California          1230264        19173   39512223     0.03113629
## 6   Colorado           232878         3037    5758736     0.04043908
##    death_rate    Biden   Trump Biden Support Rate Support Which Party
## 1 0.014339302   849648 1441168          0.3708932          Republican
## 2 0.003732494        0       0                NaN                <NA>
## 3 0.020314121  1672143 1661686          0.5015683          Democratic
## 4 0.016156007   420328  757405          0.3568958          Republican
## 5 0.015584460 11109764 6005961          0.6490969          Democratic
## 6 0.013041163  1804352 1364607          0.5693832          Democratic
##   health_cost_per_captia death_per_100 death_per_1M
## 1                   7281    0.07297298     729.7298
## 2                  11064    0.01654034     165.4034
## 3                   6452    0.09121113     912.1113
## 4                   7408    0.08290797     829.0797
## 5                   7549    0.04852423     485.2423
## 6                   6804    0.05273727     527.3727

We create a new column: death per 100 by calculating the death case divided by population.