library(dplyr)
library(ggplot2)
nyc_census_info <- read.csv("./2020_Census_Tracts_-_Tabular.csv", header = TRUE)

atlas2 <- read.csv("./atlas_modified.csv", header = TRUE)

nyc <- subset(atlas2, state == 36 & county %in% c(47, 61, 85, 81, 5))

nyc$BoroName <- ifelse(nyc$county == 47, "Brooklyn",
                       ifelse(nyc$county == 61, "Manhattan",
                              ifelse(nyc$county == 85, "Staten Island",
                                     ifelse(nyc$county == 81, "Queens",
                                            ifelse(nyc$county == 5, "Bronx", NA)))))

nyc_neigborhood_info <- select(nyc_census_info, BoroName, CT2020, NTAName)

nyc_neigborhood_info <- rename(nyc_neigborhood_info, tract = CT2020)
nyc2 <- merge(nyc, nyc_neigborhood_info[,c("BoroName", "tract", "NTAName")], by = c("BoroName", "tract"), all.x = TRUE)

names(nyc2)[which(names(nyc2) == "NTAName")] <- "neighborhood"


south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))

riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")

First Violin Plot

south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))

riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")


# Combine the two data sets into a single data frame
combined_data <- data.frame(
  value = c(south_bronx_bronx$kfr_pooled_pooled_p25, riverdale_bronx$kfr_pooled_pooled_p25),
  group = c(rep("South Bronx Bronx", length(south_bronx_bronx$kfr_pooled_pooled_p25)),
            rep("Riverdale Bronx", length(riverdale_bronx$kfr_pooled_pooled_p25))))

# Generate the violin plot
ggplot(combined_data, aes(x = group, y = value)) +
  geom_violin() +
  geom_jitter(width = 0.2, size = 1, alpha = 0.5) +  # Add raw data points
  geom_boxplot(width = 0.1, fill = "white") +  # Add box-and-whisker plot
  labs(x = "Group", y = "Value") +
  theme_minimal()
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).

#Second Violin Plot (better design)
# Load required library
library(ggplot2)

south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))

riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")


# Combine the two data sets into a single data frame
combined_data <- data.frame(
  value = c(south_bronx_bronx$kfr_pooled_pooled_p25, riverdale_bronx$kfr_pooled_pooled_p25),
  group = c(rep("South Bronx", length(south_bronx_bronx$kfr_pooled_pooled_p25)),
            rep("Riverdale", length(riverdale_bronx$kfr_pooled_pooled_p25))))

# Generate the violin plot
ggplot(combined_data, aes(x = group, y = value, fill = group)) +
  geom_violin(alpha = 0.5) +
  geom_jitter(width = 0.2, size = 1, alpha = 0.5, color = "black") +
  geom_boxplot(width = 0.1, fill = "white", alpha = 0.5, outlier.shape = NA) +
  labs(x = NULL, 
       y = "Value",
       title = "kfr_pooled_pooled_p25",
       fill = NULL) +
  theme_minimal() +
  theme(legend.position = "none", 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.ticks = element_blank(),
        plot.title = element_text(hjust = 0.5),  # Center the title
        axis.line = element_line(color = 'black'))  # Add x and y axis lines
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).

Violing Plots – all kfr

#Violing Plots -- all kfr
# Load required libraries
library(tidyverse)
library(tidyr)  # Load tidyr specifically

south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))

riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")


# List of variables to plot
variables <- c("kfr_pooled_pooled_p25", "kfr_black_pooled_p25", "kfr_hisp_pooled_p25", "kfr_white_pooled_p25", "kfr_pooled_female_p25", "kfr_pooled_male_p25", "kfr_black_female_p25", "kfr_hisp_female_p25", "kfr_white_female_p25", "kfr_black_male_p25", "kfr_hisp_male_p25", "kfr_white_male_p25")

# Function to reshape data
reshape_data <- function(data, group_name) {
  data %>% 
    select(one_of(variables)) %>%  # select required variables
    gather(key = "variable", value = "value") %>%  # convert to long format
    mutate(group = group_name)  # add group column
}

# Reshape data
south_bronx_bronx <- reshape_data(south_bronx_bronx, "South Bronx")
riverdale_bronx <- reshape_data(riverdale_bronx, "Riverdale")

# Combine the two data sets into a single data frame
combined_data <- rbind(south_bronx_bronx, riverdale_bronx)

# Filter out missing values
combined_data <- combined_data %>% filter(!is.na(value))

# Define the desired order
x_order <- c("kfr_pooled_pooled_p25_Riverdale", "kfr_pooled_pooled_p25_South Bronx", "kfr_white_pooled_p25_Riverdale", "kfr_white_pooled_p25_South Bronx", "kfr_black_pooled_p25_Riverdale", "kfr_black_pooled_p25_South Bronx", "kfr_hisp_pooled_p25_Riverdale", "kfr_hisp_pooled_p25_South Bronx", "kfr_pooled_female_p25_Riverdale", "kfr_pooled_female_p25_South Bronx", "kfr_pooled_male_p25_Riverdale", "kfr_pooled_male_p25_South Bronx", "kfr_white_female_p25_Riverdale", "kfr_white_female_p25_South Bronx", "kfr_white_male_p25_Riverdale", "kfr_white_male_p25_South Bronx", "kfr_black_female_p25_Riverdale", "kfr_black_female_p25_South Bronx", "kfr_black_male_p25_Riverdale", "kfr_black_male_p25_South Bronx", "kfr_hisp_female_p25_Riverdale", "kfr_hisp_female_p25_South Bronx", "kfr_hisp_male_p25_Riverdale", "kfr_hisp_male_p25_South Bronx")

# Update the variable_group column in combined_data to an ordered factor with the levels specified in x_order
combined_data$variable_group <- factor(paste(combined_data$variable, combined_data$group, sep = "_"), levels = x_order)

# Generate the violin plot
plot <- ggplot(combined_data, aes(x = variable_group, y = value, fill = group)) +
  geom_violin(alpha = 0.5, position = position_dodge(0.8)) +
  geom_jitter(width = 0.2, size = 1, alpha = 0.7) +  # Add raw data points
  geom_boxplot(width = 0.2, fill = "white", position = position_dodge(0.8), outlier.shape = NA) +
  labs(x = NULL, 
       y = "Value",
       fill = NULL, 
       title = "Social Mobility: Riverdale vs. South Bronx") +
  theme_minimal() +
  theme(legend.position = "none", 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1),  # Rotate x-axis labels for readability
        plot.title = element_text(hjust = 0.5),  # Center the title
        axis.line = element_line(color = 'black'))  # Add x and y axis lines

# Display the plot
print(plot)
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

Correlate riverdale_bronx$kfr_pooled_pooled_p25 to all relevant vars

# Variables to correlate with kfr_pooled_pooled_p25

south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))

riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")


# List of variables to correlate with kfr_pooled_pooled_p25
vars <- c("hhinc_mean2000", "mean_commutetime2000", "frac_coll_plus2010", "frac_coll_plus2000", "foreign_share2010", "med_hhinc2016", "med_hhinc1990", "popdensity2000", "poor_share2010", "poor_share2000", "poor_share1990", "share_black2010", "share_hisp2010", "share_asian2010", "share_black2000", "share_white2000", "share_hisp2000", "share_asian2000", "gsmn_math_g3_2013", "rent_twobed2015", "singleparent_share2010", "singleparent_share1990", "singleparent_share2000", "traveltime15_2010", "emp2000", "mail_return_rate2010", "ln_wage_growth_hs_grad", "jobs_total_5mi_2015", "jobs_highpay_5mi_2015", "nonwhite_share2010", "popdensity2010", "ann_avg_job_growth_2004_2013", "job_density_2013")

# Calculate the correlations
cor_values <- sapply(vars, function(v) cor(riverdale_bronx$kfr_pooled_pooled_p25, riverdale_bronx[[v]], use="complete.obs"))
## Warning in cor(riverdale_bronx$kfr_pooled_pooled_p25, riverdale_bronx[[v]], :
## the standard deviation is zero
# Create a dataframe with the correlations
cor_df <- data.frame(Variable = vars, Correlation = cor_values)

# Order the dataframe by absolute value of correlation
cor_df <- cor_df[order(abs(cor_df$Correlation), decreasing = TRUE), ]

# Use ggplot2 to create the heatmap
library(ggplot2)
ggplot(data = cor_df, aes(x = 1, y = reorder(Variable, Correlation), fill = Correlation)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Correlation, 2)), color = "black", size = 4) +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limits = c(-1, 1)) +
  labs(y = "", x = "", title = "Correlation with kfr_pooled_pooled_p25") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank())
## Warning: Removed 1 rows containing missing values (geom_text).

Add south_bronx_bronx$kfr_pooled_pooled_p25 correlated to all relevant vars

south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))

riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")

# List of variables to correlate with kfr_pooled_pooled_p25
vars <- c("hhinc_mean2000", "mean_commutetime2000", "frac_coll_plus2010", "frac_coll_plus2000", "foreign_share2010", "med_hhinc2016", "med_hhinc1990", "popdensity2000", "poor_share2010", "poor_share2000", "poor_share1990", "share_black2010", "share_hisp2010", "share_asian2010", "share_black2000", "share_white2000", "share_hisp2000", "share_asian2000", "gsmn_math_g3_2013", "rent_twobed2015", "singleparent_share2010", "singleparent_share1990", "singleparent_share2000", "traveltime15_2010", "emp2000", "mail_return_rate2010", "ln_wage_growth_hs_grad", "jobs_total_5mi_2015", "jobs_highpay_5mi_2015", "nonwhite_share2010", "popdensity2010", "ann_avg_job_growth_2004_2013", "job_density_2013")

# Calculate the correlations for both riverdale_bronx and south_bronx_bronx
cor_values_riverdale <- sapply(vars, function(v) cor(riverdale_bronx$kfr_pooled_pooled_p25, riverdale_bronx[[v]], use="complete.obs"))
## Warning in cor(riverdale_bronx$kfr_pooled_pooled_p25, riverdale_bronx[[v]], :
## the standard deviation is zero
cor_values_south_bronx <- sapply(vars, function(v) cor(south_bronx_bronx$kfr_pooled_pooled_p25, south_bronx_bronx[[v]], use="complete.obs"))
## Warning in cor(south_bronx_bronx$kfr_pooled_pooled_p25,
## south_bronx_bronx[[v]], : the standard deviation is zero
# Create a dataframe with the correlations
cor_df <- data.frame(Variable = vars, 
                     Correlation_Riverdale = cor_values_riverdale, 
                     Correlation_South_Bronx = cor_values_south_bronx)

# Reshape data to long format using gather()
cor_df_long <- tidyr::gather(cor_df, key = "Neighborhood", value = "Correlation", -Variable)

# Order the dataframe by absolute value of correlation
cor_df_long <- cor_df_long[order(abs(cor_df_long$Correlation), decreasing = TRUE), ]

# Use ggplot2 to create the heatmap
ggplot(data = cor_df_long, aes(x = Neighborhood, y = reorder(Variable, Correlation), fill = Correlation)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Correlation, 2)), color = "black", size = 4) +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limits = c(-1, 1)) +
  labs(y = "", x = "", title = "Correlation with kfr_pooled_pooled_p25") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), 
        axis.ticks = element_blank())
## Warning: Removed 2 rows containing missing values (geom_text).

Principal Component Analysis

#PCA

# Load required libraries
library(ggplot2)


south_bronx_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood %in% c("Mott Haven-Port Morris", "Concourse-Concourse Village", "Melrose"))

riverdale_bronx <- subset(nyc2, BoroName == "Bronx" & neighborhood == "Riverdale-Spuyten Duyvil")

vars <- c("hhinc_mean2000", "mean_commutetime2000", "frac_coll_plus2010", "frac_coll_plus2000", "foreign_share2010", "med_hhinc2016", "med_hhinc1990", "popdensity2000", "poor_share2010", "poor_share2000", "poor_share1990", "share_black2010", "share_hisp2010", "share_asian2010", "share_black2000", "share_white2000", "share_hisp2000", "share_asian2000", "gsmn_math_g3_2013", "rent_twobed2015", "singleparent_share2010", "singleparent_share1990", "singleparent_share2000", "traveltime15_2010", "emp2000", "mail_return_rate2010", "ln_wage_growth_hs_grad", "jobs_total_5mi_2015", "jobs_highpay_5mi_2015", "nonwhite_share2010", "popdensity2010", "ann_avg_job_growth_2004_2013", "job_density_2013")


# Combine datasets into one dataframe
combined_df <- rbind(
  cbind(riverdale_bronx[vars], Neighborhood = "Riverdale"),
  cbind(south_bronx_bronx[vars], Neighborhood = "South Bronx")
)

# Identify constant or zero variance variables
constant_vars <- vars[sapply(combined_df[vars], function(col) length(unique(na.omit(col)))==1)]

# Remove constant or zero variance variables from your list of variables
vars_filtered <- setdiff(vars, constant_vars)

# Remove NA values
combined_df <- na.omit(combined_df)

# Perform PCA on the combined data
pca.combined <- prcomp(combined_df[vars_filtered], scale. = TRUE)

# Get PCA scores
scores.combined <- as.data.frame(pca.combined$x)

# Add Neighborhood column to the PCA scores
scores.combined$Neighborhood <- combined_df$Neighborhood

# Calculate percentage of variance explained
explained_variance <- round((pca.combined$sdev^2 / sum(pca.combined$sdev^2)) * 100, 2)

# Plot PCA results
ggplot(scores.combined, aes(x = PC1, y = PC2, color = Neighborhood)) +
  geom_point() +
  labs(x = paste0("PC1: ", explained_variance[1], "% variance"),
       y = paste0("PC2: ", explained_variance[2], "% variance")) +
  ggtitle("PCA for Riverdale and South Bronx") +
  theme_minimal()