library(dplyr)
library(ggplot2)
nyc_census_info <- read.csv("./2020_Census_Tracts_-_Tabular.csv", header = TRUE)
atlas2 <- read.csv("./atlas_modified.csv", header = TRUE)
nyc <- subset(atlas2, state == 36 & county %in% c(47, 61, 85, 81, 5))
nyc$BoroName <- ifelse(nyc$county == 47, "Brooklyn",
ifelse(nyc$county == 61, "Manhattan",
ifelse(nyc$county == 85, "Staten Island",
ifelse(nyc$county == 81, "Queens",
ifelse(nyc$county == 5, "Bronx", NA)))))
nyc_neigborhood_info <- select(nyc_census_info, BoroName, CT2020, NTAName)
nyc_neigborhood_info <- rename(nyc_neigborhood_info, tract = CT2020)
nyc2 <- merge(nyc, nyc_neigborhood_info[,c("BoroName", "tract", "NTAName")], by = c("BoroName", "tract"), all.x = TRUE)
names(nyc2)[which(names(nyc2) == "NTAName")] <- "neighborhood"
south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))
riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")
First Violin Plot
south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))
riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")
# Combine the two data sets into a single data frame
combined_data <- data.frame(
value = c(south_bronx_bronx$kfr_pooled_pooled_p25, riverdale_bronx$kfr_pooled_pooled_p25),
group = c(rep("South Bronx Bronx", length(south_bronx_bronx$kfr_pooled_pooled_p25)),
rep("Riverdale Bronx", length(riverdale_bronx$kfr_pooled_pooled_p25))))
# Generate the violin plot
ggplot(combined_data, aes(x = group, y = value)) +
geom_violin() +
geom_jitter(width = 0.2, size = 1, alpha = 0.5) + # Add raw data points
geom_boxplot(width = 0.1, fill = "white") + # Add box-and-whisker plot
labs(x = "Group", y = "Value") +
theme_minimal()
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
#Second Violin Plot (better design)
# Load required library
library(ggplot2)
south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))
riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")
# Combine the two data sets into a single data frame
combined_data <- data.frame(
value = c(south_bronx_bronx$kfr_pooled_pooled_p25, riverdale_bronx$kfr_pooled_pooled_p25),
group = c(rep("South Bronx", length(south_bronx_bronx$kfr_pooled_pooled_p25)),
rep("Riverdale", length(riverdale_bronx$kfr_pooled_pooled_p25))))
# Generate the violin plot
ggplot(combined_data, aes(x = group, y = value, fill = group)) +
geom_violin(alpha = 0.5) +
geom_jitter(width = 0.2, size = 1, alpha = 0.5, color = "black") +
geom_boxplot(width = 0.1, fill = "white", alpha = 0.5, outlier.shape = NA) +
labs(x = NULL,
y = "Value",
title = "kfr_pooled_pooled_p25",
fill = NULL) +
theme_minimal() +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5), # Center the title
axis.line = element_line(color = 'black')) # Add x and y axis lines
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
Violing Plots – all kfr
#Violing Plots -- all kfr
# Load required libraries
library(tidyverse)
library(tidyr) # Load tidyr specifically
south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))
riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")
# List of variables to plot
variables <- c("kfr_pooled_pooled_p25", "kfr_black_pooled_p25", "kfr_hisp_pooled_p25", "kfr_white_pooled_p25", "kfr_pooled_female_p25", "kfr_pooled_male_p25", "kfr_black_female_p25", "kfr_hisp_female_p25", "kfr_white_female_p25", "kfr_black_male_p25", "kfr_hisp_male_p25", "kfr_white_male_p25")
# Function to reshape data
reshape_data <- function(data, group_name) {
data %>%
select(one_of(variables)) %>% # select required variables
gather(key = "variable", value = "value") %>% # convert to long format
mutate(group = group_name) # add group column
}
# Reshape data
south_bronx_bronx <- reshape_data(south_bronx_bronx, "South Bronx")
riverdale_bronx <- reshape_data(riverdale_bronx, "Riverdale")
# Combine the two data sets into a single data frame
combined_data <- rbind(south_bronx_bronx, riverdale_bronx)
# Filter out missing values
combined_data <- combined_data %>% filter(!is.na(value))
# Define the desired order
x_order <- c("kfr_pooled_pooled_p25_Riverdale", "kfr_pooled_pooled_p25_South Bronx", "kfr_white_pooled_p25_Riverdale", "kfr_white_pooled_p25_South Bronx", "kfr_black_pooled_p25_Riverdale", "kfr_black_pooled_p25_South Bronx", "kfr_hisp_pooled_p25_Riverdale", "kfr_hisp_pooled_p25_South Bronx", "kfr_pooled_female_p25_Riverdale", "kfr_pooled_female_p25_South Bronx", "kfr_pooled_male_p25_Riverdale", "kfr_pooled_male_p25_South Bronx", "kfr_white_female_p25_Riverdale", "kfr_white_female_p25_South Bronx", "kfr_white_male_p25_Riverdale", "kfr_white_male_p25_South Bronx", "kfr_black_female_p25_Riverdale", "kfr_black_female_p25_South Bronx", "kfr_black_male_p25_Riverdale", "kfr_black_male_p25_South Bronx", "kfr_hisp_female_p25_Riverdale", "kfr_hisp_female_p25_South Bronx", "kfr_hisp_male_p25_Riverdale", "kfr_hisp_male_p25_South Bronx")
# Update the variable_group column in combined_data to an ordered factor with the levels specified in x_order
combined_data$variable_group <- factor(paste(combined_data$variable, combined_data$group, sep = "_"), levels = x_order)
# Generate the violin plot
plot <- ggplot(combined_data, aes(x = variable_group, y = value, fill = group)) +
geom_violin(alpha = 0.5, position = position_dodge(0.8)) +
geom_jitter(width = 0.2, size = 1, alpha = 0.7) + # Add raw data points
geom_boxplot(width = 0.2, fill = "white", position = position_dodge(0.8), outlier.shape = NA) +
labs(x = NULL,
y = "Value",
fill = NULL,
title = "Social Mobility: Riverdale vs. South Bronx") +
theme_minimal() +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1), # Rotate x-axis labels for readability
plot.title = element_text(hjust = 0.5), # Center the title
axis.line = element_line(color = 'black')) # Add x and y axis lines
# Display the plot
print(plot)
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
Correlate riverdale_bronx$kfr_pooled_pooled_p25 to all relevant vars
# Variables to correlate with kfr_pooled_pooled_p25
south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))
riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")
# List of variables to correlate with kfr_pooled_pooled_p25
vars <- c("hhinc_mean2000", "mean_commutetime2000", "frac_coll_plus2010", "frac_coll_plus2000", "foreign_share2010", "med_hhinc2016", "med_hhinc1990", "popdensity2000", "poor_share2010", "poor_share2000", "poor_share1990", "share_black2010", "share_hisp2010", "share_asian2010", "share_black2000", "share_white2000", "share_hisp2000", "share_asian2000", "gsmn_math_g3_2013", "rent_twobed2015", "singleparent_share2010", "singleparent_share1990", "singleparent_share2000", "traveltime15_2010", "emp2000", "mail_return_rate2010", "ln_wage_growth_hs_grad", "jobs_total_5mi_2015", "jobs_highpay_5mi_2015", "nonwhite_share2010", "popdensity2010", "ann_avg_job_growth_2004_2013", "job_density_2013")
# Calculate the correlations
cor_values <- sapply(vars, function(v) cor(riverdale_bronx$kfr_pooled_pooled_p25, riverdale_bronx[[v]], use="complete.obs"))
## Warning in cor(riverdale_bronx$kfr_pooled_pooled_p25, riverdale_bronx[[v]], :
## the standard deviation is zero
# Create a dataframe with the correlations
cor_df <- data.frame(Variable = vars, Correlation = cor_values)
# Order the dataframe by absolute value of correlation
cor_df <- cor_df[order(abs(cor_df$Correlation), decreasing = TRUE), ]
# Use ggplot2 to create the heatmap
library(ggplot2)
ggplot(data = cor_df, aes(x = 1, y = reorder(Variable, Correlation), fill = Correlation)) +
geom_tile(color = "white") +
geom_text(aes(label = round(Correlation, 2)), color = "black", size = 4) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limits = c(-1, 1)) +
labs(y = "", x = "", title = "Correlation with kfr_pooled_pooled_p25") +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.ticks = element_blank())
## Warning: Removed 1 rows containing missing values (geom_text).
Add south_bronx_bronx$kfr_pooled_pooled_p25 correlated to all relevant vars
south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))
riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")
# List of variables to correlate with kfr_pooled_pooled_p25
vars <- c("hhinc_mean2000", "mean_commutetime2000", "frac_coll_plus2010", "frac_coll_plus2000", "foreign_share2010", "med_hhinc2016", "med_hhinc1990", "popdensity2000", "poor_share2010", "poor_share2000", "poor_share1990", "share_black2010", "share_hisp2010", "share_asian2010", "share_black2000", "share_white2000", "share_hisp2000", "share_asian2000", "gsmn_math_g3_2013", "rent_twobed2015", "singleparent_share2010", "singleparent_share1990", "singleparent_share2000", "traveltime15_2010", "emp2000", "mail_return_rate2010", "ln_wage_growth_hs_grad", "jobs_total_5mi_2015", "jobs_highpay_5mi_2015", "nonwhite_share2010", "popdensity2010", "ann_avg_job_growth_2004_2013", "job_density_2013")
# Calculate the correlations for both riverdale_bronx and south_bronx_bronx
cor_values_riverdale <- sapply(vars, function(v) cor(riverdale_bronx$kfr_pooled_pooled_p25, riverdale_bronx[[v]], use="complete.obs"))
## Warning in cor(riverdale_bronx$kfr_pooled_pooled_p25, riverdale_bronx[[v]], :
## the standard deviation is zero
cor_values_south_bronx <- sapply(vars, function(v) cor(south_bronx_bronx$kfr_pooled_pooled_p25, south_bronx_bronx[[v]], use="complete.obs"))
## Warning in cor(south_bronx_bronx$kfr_pooled_pooled_p25,
## south_bronx_bronx[[v]], : the standard deviation is zero
# Create a dataframe with the correlations
cor_df <- data.frame(Variable = vars,
Correlation_Riverdale = cor_values_riverdale,
Correlation_South_Bronx = cor_values_south_bronx)
# Reshape data to long format using gather()
cor_df_long <- tidyr::gather(cor_df, key = "Neighborhood", value = "Correlation", -Variable)
# Order the dataframe by absolute value of correlation
cor_df_long <- cor_df_long[order(abs(cor_df_long$Correlation), decreasing = TRUE), ]
# Use ggplot2 to create the heatmap
ggplot(data = cor_df_long, aes(x = Neighborhood, y = reorder(Variable, Correlation), fill = Correlation)) +
geom_tile(color = "white") +
geom_text(aes(label = round(Correlation, 2)), color = "black", size = 4) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limits = c(-1, 1)) +
labs(y = "", x = "", title = "Correlation with kfr_pooled_pooled_p25") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.ticks = element_blank())
## Warning: Removed 2 rows containing missing values (geom_text).
Principal Component Analysis
#PCA
# Load required libraries
library(ggplot2)
south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))
riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")
vars <- c("hhinc_mean2000", "mean_commutetime2000", "frac_coll_plus2010", "frac_coll_plus2000", "foreign_share2010", "med_hhinc2016", "med_hhinc1990", "popdensity2000", "poor_share2010", "poor_share2000", "poor_share1990", "share_black2010", "share_hisp2010", "share_asian2010", "share_black2000", "share_white2000", "share_hisp2000", "share_asian2000", "gsmn_math_g3_2013", "rent_twobed2015", "singleparent_share2010", "singleparent_share1990", "singleparent_share2000", "traveltime15_2010", "emp2000", "mail_return_rate2010", "ln_wage_growth_hs_grad", "jobs_total_5mi_2015", "jobs_highpay_5mi_2015", "nonwhite_share2010", "popdensity2010", "ann_avg_job_growth_2004_2013", "job_density_2013")
# Combine datasets into one dataframe
combined_df <- rbind(
cbind(riverdale_bronx[vars], Neighborhood = "Riverdale"),
cbind(south_bronx_bronx[vars], Neighborhood = "South Bronx")
)
# Identify constant or zero variance variables
constant_vars <- vars[sapply(combined_df[vars], function(col) length(unique(na.omit(col)))==1)]
# Remove constant or zero variance variables from your list of variables
vars_filtered <- setdiff(vars, constant_vars)
# Remove NA values
combined_df <- na.omit(combined_df)
# Perform PCA on the combined data
pca.combined <- prcomp(combined_df[vars_filtered], scale. = TRUE)
# Get PCA scores
scores.combined <- as.data.frame(pca.combined$x)
# Add Neighborhood column to the PCA scores
scores.combined$Neighborhood <- combined_df$Neighborhood
# Calculate percentage of variance explained
explained_variance <- round((pca.combined$sdev^2 / sum(pca.combined$sdev^2)) * 100, 2)
# Plot PCA results
ggplot(scores.combined, aes(x = PC1, y = PC2, color = Neighborhood)) +
geom_point() +
labs(x = paste0("PC1: ", explained_variance[1], "% variance"),
y = paste0("PC2: ", explained_variance[2], "% variance")) +
ggtitle("PCA for Riverdale and South Bronx") +
theme_minimal()