library(dplyr)
library(ggplot2)
# ---- Load the raw inputs ----
nyc_census_info <- read.csv("./2020_Census_Tracts_-_Tabular.csv", header = TRUE)

atlas2 <- read.csv("./atlas_modified.csv", header = TRUE)

# ---- Keep only the five NYC counties (state 36) ----
# County codes and borough names are parallel vectors: position i of one
# corresponds to position i of the other.
nyc_county_codes <- c(47, 61, 85, 81, 5)
nyc_borough_names <- c(
  "Brooklyn", "Manhattan", "Staten Island", "Queens", "Bronx"
)

nyc <- subset(atlas2, state == 36 & county %in% nyc_county_codes)

# Translate each county code to its borough name; a code that is not in
# the lookup yields NA.
nyc$BoroName <- nyc_borough_names[match(nyc$county, nyc_county_codes)]

# ---- Attach neighbourhood names to the atlas tracts ----
# Keep just the join keys and the neighbourhood name, renaming CT2020 to
# `tract` so it lines up with the atlas column of the same name.
nyc_neigborhood_info <- nyc_census_info %>%
  select(BoroName, tract = CT2020, NTAName)

# Left join: every atlas tract is kept, matched or not.
# NOTE(review): confirm that the atlas `tract` codes and CT2020 use the
# same census vintage and formatting, otherwise rows will not match.
nyc2 <- merge(
  x = nyc,
  y = nyc_neigborhood_info,
  by = c("BoroName", "tract"),
  all.x = TRUE
)

names(nyc2)[names(nyc2) == "NTAName"] <- "neighborhood"

# ---- Bronx comparison groups ----
bronx_tracts <- subset(nyc2, BoroName == "Bronx")

south_bronx_bronx <- subset(
  bronx_tracts,
  neighborhood %in% c(
    "Mott Haven-Port Morris",
    "Concourse-Concourse Village",
    "Melrose"
  )
)

riverdale_bronx <- subset(
  bronx_tracts,
  neighborhood == "Riverdale-Spuyten Duyvil"
)

# First Violin Plot

# Tracts for the two Bronx areas being compared
bronx_tracts <- subset(nyc2, BoroName == "Bronx")

south_bronx_bronx <- subset(
  bronx_tracts,
  neighborhood %in% c(
    "Mott Haven-Port Morris",
    "Concourse-Concourse Village",
    "Melrose"
  )
)

riverdale_bronx <- subset(
  bronx_tracts,
  neighborhood == "Riverdale-Spuyten Duyvil"
)

# Stack the outcome column from both areas, labelling each value with the
# area it came from (South Bronx values first, then Riverdale).
south_bronx_values <- south_bronx_bronx$kfr_pooled_pooled_p25
riverdale_values <- riverdale_bronx$kfr_pooled_pooled_p25

combined_data <- data.frame(
  value = c(south_bronx_values, riverdale_values),
  group = rep(
    c("South Bronx Bronx", "Riverdale Bronx"),
    times = c(length(south_bronx_values), length(riverdale_values))
  )
)

# Violin per area, with the raw points and a narrow box plot drawn on top
ggplot(data = combined_data, mapping = aes(x = group, y = value)) +
  geom_violin() +
  geom_jitter(size = 1, alpha = 0.5, width = 0.2) +
  geom_boxplot(fill = "white", width = 0.1) +
  labs(x = "Group", y = "Value") +
  theme_minimal()
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).

#Second Violin Plot (better design)
# Load required library
library(ggplot2)

# Tracts for the two Bronx areas being compared
bronx_tracts <- subset(nyc2, BoroName == "Bronx")

south_bronx_bronx <- subset(
  bronx_tracts,
  neighborhood %in% c(
    "Mott Haven-Port Morris",
    "Concourse-Concourse Village",
    "Melrose"
  )
)

riverdale_bronx <- subset(
  bronx_tracts,
  neighborhood == "Riverdale-Spuyten Duyvil"
)

# Stack the outcome column from both areas, labelling each value with the
# area it came from (South Bronx values first, then Riverdale).
south_bronx_values <- south_bronx_bronx$kfr_pooled_pooled_p25
riverdale_values <- riverdale_bronx$kfr_pooled_pooled_p25

combined_data <- data.frame(
  value = c(south_bronx_values, riverdale_values),
  group = rep(
    c("South Bronx", "Riverdale"),
    times = c(length(south_bronx_values), length(riverdale_values))
  )
)

# Violin per area, filled by area, with raw points and a box plot on top.
# Box-plot outliers are hidden because the jittered points already show
# every observation.
ggplot(
  data = combined_data,
  mapping = aes(x = group, y = value, fill = group)
) +
  geom_violin(alpha = 0.5) +
  geom_jitter(color = "black", size = 1, alpha = 0.5, width = 0.2) +
  geom_boxplot(
    width = 0.1,
    fill = "white",
    alpha = 0.5,
    outlier.shape = NA
  ) +
  labs(
    title = "kfr_pooled_pooled_p25",
    x = NULL,
    y = "Value",
    fill = NULL
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.ticks = element_blank(),
    plot.title = element_text(hjust = 0.5),     # centred title
    axis.line = element_line(color = "black")   # visible x and y axis lines
  )
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).

# Violin Plots – all kfr

# Violin Plots -- all kfr
# Load required libraries
library(tidyverse)
library(tidyr)  # Load tidyr specifically

# Rebuild the two Bronx comparison groups from the merged tract table
bronx_tracts <- subset(nyc2, BoroName == "Bronx")

south_bronx_bronx <- subset(
  bronx_tracts,
  neighborhood %in% c(
    "Mott Haven-Port Morris",
    "Concourse-Concourse Village",
    "Melrose"
  )
)

riverdale_bronx <- subset(
  bronx_tracts,
  neighborhood == "Riverdale-Spuyten Duyvil"
)


# Outcome columns to plot, one violin pair per column
variables <- c(
  "kfr_pooled_pooled_p25",
  "kfr_black_pooled_p25",
  "kfr_hisp_pooled_p25",
  "kfr_white_pooled_p25",
  "kfr_pooled_female_p25",
  "kfr_pooled_male_p25",
  "kfr_black_female_p25",
  "kfr_hisp_female_p25",
  "kfr_white_female_p25",
  "kfr_black_male_p25",
  "kfr_hisp_male_p25",
  "kfr_white_male_p25"
)

#' Reshape a data frame to long format and tag it with a group label
#'
#' Keeps only the columns named in `vars`, stacks them into one
#' `variable` / `value` pair per row, and adds a constant `group` column.
#'
#' @param data A data frame containing (some of) the columns in `vars`.
#' @param group_name Value written to the `group` column of every row.
#' @param vars Character vector of column names to keep. Defaults to the
#'   script-level `variables` vector, so existing two-argument calls
#'   behave as before.
#' @return A long-format table with columns `variable`, `value`, `group`.
reshape_data <- function(data, group_name, vars = variables) {
  # The superseded one_of() warned about requested columns that were
  # absent; any_of() is silent, so keep that diagnostic explicitly.
  missing_vars <- setdiff(vars, names(data))
  if (length(missing_vars) > 0) {
    warning(
      "Unknown columns: ", paste(missing_vars, collapse = ", "),
      call. = FALSE
    )
  }

  # pivot_longer() replaces the superseded gather(). Note that it emits
  # rows grouped by input row rather than by column, so row order differs
  # from gather() even though the set of rows is the same.
  data %>%
    select(any_of(vars)) %>%
    pivot_longer(
      cols = everything(),
      names_to = "variable",
      values_to = "value"
    ) %>%
    mutate(group = group_name)
}

# Convert each neighbourhood to long format (this overwrites the wide
# subsets), stack the two, and drop rows that have no value.
south_bronx_bronx <- reshape_data(south_bronx_bronx, "South Bronx")
riverdale_bronx <- reshape_data(riverdale_bronx, "Riverdale")

combined_data <- rbind(south_bronx_bronx, riverdale_bronx) %>%
  filter(!is.na(value))

# Left-to-right order of the x axis: for each outcome, Riverdale first and
# South Bronx second. Outcomes run pooled, then by race, then by gender,
# then race-by-gender.
x_axis_variables <- c(
  "kfr_pooled_pooled_p25",
  "kfr_white_pooled_p25",
  "kfr_black_pooled_p25",
  "kfr_hisp_pooled_p25",
  "kfr_pooled_female_p25",
  "kfr_pooled_male_p25",
  "kfr_white_female_p25",
  "kfr_white_male_p25",
  "kfr_black_female_p25",
  "kfr_black_male_p25",
  "kfr_hisp_female_p25",
  "kfr_hisp_male_p25"
)
x_order <- paste(
  rep(x_axis_variables, each = 2),
  c("Riverdale", "South Bronx"),
  sep = "_"
)

# Combine outcome and group into one factor whose levels follow x_order,
# so ggplot lays the violins out in that sequence.
combined_data <- combined_data %>%
  mutate(
    variable_group = factor(
      paste(variable, group, sep = "_"),
      levels = x_order
    )
  )

# One violin per outcome/group pair, with raw points and a box plot on top
plot <- ggplot(
  data = combined_data,
  mapping = aes(x = variable_group, y = value, fill = group)
) +
  geom_violin(position = position_dodge(0.8), alpha = 0.5) +
  geom_jitter(size = 1, alpha = 0.7, width = 0.2) +
  geom_boxplot(
    position = position_dodge(0.8),
    width = 0.2,
    fill = "white",
    outlier.shape = NA
  ) +
  labs(
    title = "Social Mobility: Riverdale vs. South Bronx",
    x = NULL,
    y = "Value",
    fill = NULL
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1),  # vertical labels
    plot.title = element_text(hjust = 0.5),             # centred title
    axis.line = element_line(color = "black")           # visible axis lines
  )

# Render the figure
print(plot)
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.