Data Cleaning¶
In this notebook, we'll
load the raw data into R or Pandas dataframe from the
./data/raw
directory using the functions and classes (written during the data gathering phase) in the./scripts
directoryperform visual exploration on your data variables to detect anomalies, errors, outliers, interesting features etc.
clean categorical and quantitative variables (by removing observations and variables with too many missing values, by consolidating categorical variables, by selecting an appropriate subset of variables and observations for later analysis, etc.)
save the cleaned data into csv files in the
./data/cleaned
directorysave interesting graphics we obtained during visual exploration into the
./visualiation
directory
Cleaning the HIV Data¶
First, we'll need to load the data again using the functions defined in the previous notebook.
%load_ext rmagic
%%R
source('./scripts/hiv_loader.R')
hiv_diagnoses = get_hiv_diagnoses_df()
citizenship_status = get_citizenship_status_df()
fpl = get_fpl_df()
income = get_income_df()
unemployment = get_unemployment_df()
insured = get_insured_df()
hospital = get_hospital_df()
federal_funds = get_federal_funds_df()
test_rate = get_test_rate_df()
sex_ed_6_9 = get_sex_ed_6_8_df()
sex_ed_9_12 = get_sex_ed_9_12_df()
chlamydia = get_chlamydia_df()
gonorrhea = get_gonorrhea_df()
syphilis = get_syphilis_df()
Next, we'll filter the Location
field so that we limit our analysis to the 50 US states. We'll also reindex the dataframe.
%%R
states = c('Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming')
hiv_diagnoses = hiv_diagnoses[hiv_diagnoses$Location %in% states, ]
row.names(hiv_diagnoses) = 1:nrow(hiv_diagnoses)
citizenship_status = citizenship_status[citizenship_status$Location %in% states, ]
row.names(citizenship_status) = 1:nrow(citizenship_status)
fpl = fpl[fpl$Location %in% states, ]
row.names(fpl) = 1:nrow(fpl)
income = income[income$Location %in% states, ]
row.names(income) = 1:nrow(income)
unemployment = unemployment[unemployment$Location %in% states, ]
row.names(unemployment) = 1:nrow(unemployment)
insured = insured[insured$Location %in% states, ]
row.names(insured) = 1:nrow(insured)
hospital = hospital[hospital$Location %in% states, ]
row.names(hospital) = 1:nrow(hospital)
federal_funds = federal_funds[federal_funds$Location %in% states, ]
row.names(federal_funds) = 1:nrow(federal_funds)
test_rate = test_rate[test_rate$Location %in% states, ]
row.names(test_rate) = 1:nrow(test_rate)
sex_ed_6_9 = sex_ed_6_9[sex_ed_6_9$Location %in% states, ]
row.names(sex_ed_6_9) = 1:nrow(sex_ed_6_9)
sex_ed_9_12 = sex_ed_9_12[sex_ed_9_12$Location %in% states, ]
row.names(sex_ed_9_12) = 1:nrow(sex_ed_9_12)
chlamydia = chlamydia[chlamydia$Location %in% states, ]
row.names(chlamydia) = 1:nrow(chlamydia)
gonorrhea = gonorrhea[gonorrhea$Location %in% states, ]
row.names(gonorrhea) = 1:nrow(gonorrhea)
syphilis = syphilis[syphilis$Location %in% states, ]
row.names(syphilis) = 1:nrow(syphilis)
Dealing with Missing Values¶
We conformed data to NA
in citizenship_status
and added NA
values for Wyoming, which was missing completely.
%%R
citizenship_status[citizenship_status == 'NSD'] = NA
insured[insured == 'NSD'] = NA
if (nrow(citizenship_status) != 50) {
citizenship_status = rbind(citizenship_status, c('Wyoming', NA, NA, NA))
}
Normalizing Data Values¶
To make things easier later, we'll use state abbreviations rather than their full names.
%%R
abbreviations = c('AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY')
hiv_diagnoses$Location = abbreviations
citizenship_status$Location = abbreviations
fpl$Location = abbreviations
income$Location = abbreviations
unemployment$Location = abbreviations
insured$Location = abbreviations
hospital$Location = abbreviations
federal_funds$Location = abbreviations
test_rate$Location = abbreviations
sex_ed_6_9$Location = abbreviations
sex_ed_9_12$Location = abbreviations
chlamydia$Location = abbreviations
gonorrhea$Location = abbreviations
syphilis$Location = abbreviations
%%R
file.create(c('./data/cleaned/HIV_Diagnoses.csv',
'./data/cleaned/Citizenship_Status.csv',
'./data/cleaned/fpl.csv',
'./data/cleaned/Median_Annual_Income.csv',
'./data/cleaned/unemployment_rate.csv',
'./data/cleaned/Has_Insurance.csv',
'./data/cleaned/Hospital_Owners.csv',
'./data/cleaned/Federal_Funds.csv',
'./data/cleaned/Test_Rate.csv',
'./data/cleaned/Sex_Ed_6_8.csv',
'./data/cleaned/Sex_Ed_9_12.csv',
'./data/cleaned/chlamydia_cases.csv',
'./data/cleaned/gonorrhea_cases.csv',
'./data/cleaned/syphilis_cases.csv'))
write.csv(hiv_diagnoses, './data/cleaned/HIV_Diagnoses.csv', row.names=F)
write.csv(citizenship_status, './data/cleaned/Citizenship_Status.csv', row.names=F)
write.csv(fpl, './data/cleaned/fpl.csv', row.names=F)
write.csv(income, './data/cleaned/Median_Annual_Income.csv', row.names=F)
write.csv(unemployment, './data/cleaned/unemployment_rate.csv', row.names=F)
write.csv(insured, './data/cleaned/Has_Insurance.csv', row.names=F)
write.csv(hospital, './data/cleaned/Hospital_Owners.csv', row.names=F)
write.csv(federal_funds, './data/cleaned/Federal_Funds.csv', row.names=F)
write.csv(test_rate, './data/cleaned/Test_Rate.csv', row.names=F)
write.csv(sex_ed_6_9, './data/cleaned/Sex_Ed_6_8.csv', row.names=F)
write.csv(sex_ed_9_12, './data/cleaned/Sex_Ed_9_12.csv', row.names=F)
write.csv(chlamydia, './data/cleaned/chlamydia_cases.csv', row.names=F)
write.csv(gonorrhea, './data/cleaned/gonorrhea_cases.csv', row.names=F)
write.csv(syphilis, './data/cleaned/syphilis_cases.csv', row.names=F)
A Helper Function for Future Parts¶
Whenever possible, we'll try to keep things normalized to 1, since many of these fields are percentages. We'll also need to convert everything to a numeric
type.
%%file ./scripts/hiv_loader_clean.R
options(stringsAsFactors=FALSE)
get_all_data = function() {
all_data = matrix()
hiv_diagnoses = read.csv('./data/cleaned/HIV_Diagnoses.csv')
citizenship_status = read.csv('./data/cleaned/Citizenship_Status.csv')
fpl = read.csv('./data/cleaned/fpl.csv')
income = read.csv('./data/cleaned/Median_Annual_Income.csv')
unemployment = read.csv('./data/cleaned/unemployment_rate.csv')
insured = read.csv('./data/cleaned/Has_Insurance.csv')
hospital = read.csv('./data/cleaned/Hospital_Owners.csv')
federal_funds = read.csv('./data/cleaned/Federal_Funds.csv')
test_rate = read.csv('./data/cleaned/Test_Rate.csv')
sex_ed_6_9 = read.csv('./data/cleaned/Sex_Ed_6_8.csv')
sex_ed_9_12 = read.csv('./data/cleaned/Sex_Ed_9_12.csv')
chlamydia = read.csv('./data/cleaned/chlamydia_cases.csv')
gonorrhea = read.csv('./data/cleaned/gonorrhea_cases.csv')
syphilis = read.csv('./data/cleaned/syphilis_cases.csv')
hiv_diagnoses[, -1] = sapply(hiv_diagnoses[, -1], as.numeric)
citizenship_status[, -1] = sapply(citizenship_status[, -1], as.numeric)
fpl[, -1] = sapply(fpl[, -1], as.numeric)
income[, -1] = sapply(income[, -1], as.numeric)
unemployment[, -1] = sapply(unemployment[, -1], as.numeric)
insured[, -1] = sapply(insured[, -1], as.numeric)
hospital[, -1] = sapply(hospital[, -1], as.numeric)
federal_funds[, -1] = sapply(federal_funds[, -1], as.numeric)
test_rate[, -1] = sapply(test_rate[, -1], as.numeric)
sex_ed_6_9[, -1] = sapply(sex_ed_6_9[, -1], as.numeric)
sex_ed_9_12[, -1] = sapply(sex_ed_9_12[, -1], as.numeric)
chlamydia[, -1] = sapply(chlamydia[, -1], as.numeric)
gonorrhea[, -1] = sapply(gonorrhea[, -1], as.numeric)
syphilis[, -1] = sapply(syphilis[, -1], as.numeric)
# Percentages are probably much more useful here than raw numbers
hospital[, -1] = hospital[, -1] / 100
sex_ed_6_9[, -1] = sex_ed_6_9[, -1] / 100
sex_ed_9_12[, -1] = sex_ed_9_12[, -1] / 100
federal_funds[, -1] = federal_funds[, -1] / 100000000
all_data$hiv_diagnoses = hiv_diagnoses
all_data$citizenship_status = citizenship_status
all_data$fpl = fpl
all_data$income = income
all_data$unemployment = unemployment
all_data$insured = insured
all_data$hospital = hospital
all_data$federal_funds = federal_funds
all_data$test_rate = test_rate
all_data$sex_ed_6_9 = sex_ed_6_9
all_data$sex_ed_9_12 = sex_ed_9_12
all_data$chlamydia = chlamydia
all_data$gonorrhea = gonorrhea
all_data$syphilis = syphilis
print('Loaded: hiv_diagnoses, citizenship_status, fpl, income, unemployment, insured, hospital, federal_funds, test_rate, sex_ed_6_9, sex_ed_9_12, chlamydia, gonorrhea, syphilis')
return(all_data)
}