NYC school

Data cleaning

Import data

D75S <-  read_tsv("masterfile11_d75_final.txt")
## Rows: 56 Columns: 1773
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr    (5): dbn, bn, schoolname, studentssurveyed, schooltype
## dbl (1739): d75, highschool, rr_s, rr_t, rr_p, N_s, N_t, N_p, nr_s, nr_t, nr...
## lgl   (29): p_q5, p_q9, p_q13a, p_q13b, p_q13c, p_q13d, p_q14a, p_q14b, p_q1...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
generalS <- read_tsv("masterfile11_gened_final.txt")
## Rows: 1646 Columns: 1942
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr    (5): dbn, bn, schoolname, studentssurveyed, schooltype
## dbl (1904): d75, highschool, rr_s, rr_t, rr_p, N_s, N_t, N_p, nr_s, nr_t, nr...
## lgl   (33): p_q1, p_q3d, p_q9, p_q10, p_q12aa, p_q12ab, p_q12ac, p_q12ad, p_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined <- read_csv("combined.csv")
## Rows: 479 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): DBN, school_name, boro
## dbl (27): Num of SAT Test Takers, SAT Critical Reading Avg. Score, SAT Math ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Simplify the survey dataframes to include only variables like dbn and variables that contain aggregate scores like saf_p_11 and com_s_11.

Filter the survey dataframes to include only observations for high schools.

d75_select <- D75S %>% 
  select(dbn:aca_tot_11)

general_select <- generalS %>% 
  select(dbn:aca_tot_11) %>% 
  filter(schooltype == 'High School')

Combine three dataframes. Use left_join() to keep the observations in the survey dataframe that correspond to observations in combined.

survey_total <- d75_select %>% 
  bind_rows(general_select) %>% 
  rename(DBN = dbn)

combined_survej <- combined %>% 
  left_join(survey_total, by = 'DBN')

Correlation to show if student, teacher, and parent perceptions of NYC school quality appear to be related to demographic and academic success metrics.

cor_mat <- combined_survej %>% 
  select(avg_sat_score, contains('_11')) %>% 
  cor(use = 'pairwise.complete.obs')

cor_tib <- cor_mat %>% 
  as_tibble(rownames= 'variable')

Find strong correlations of other variables with avg_sat_score that are greater than 0.25 or less than -0.25 (strong correlations).

strong_cors <- cor_tib %>%
  select(variable, avg_sat_score) %>%
  filter(avg_sat_score > 0.25 | avg_sat_score < -0.25)  

Create scatter plots to explore potentially interesting relationships between variables in greater detail.

strong_cors %>% 
  filter(variable != 'avg_sat_score') %>% 
  ggplot(aes(x = variable, y = avg_sat_score ))+
  geom_point()+
  labs(
    title= 'Strong correlation between average SAT score and servey varibles',
    x = 'Survey variables',
    y = 'Average SAT score'
  )

#Differences in Student, Parent, and Teacher Perceptions

reshape data to to long type.

combined_survej_long <- combined_survej %>% 
  pivot_longer(cols = saf_p_11:aca_tot_11,
               names_to = 'survey_question',
               values_to = 'score')

Use str_sub to create new variables, group and question, from the survey_question variable.

combined_survej_long <- combined_survej_long %>% 
  mutate(question = str_sub(survey_question,1,3)) %>% 
  mutate(group= str_sub(survey_question,4,6))

Use case_when to rename values in group and question.

combined_survej_long <- combined_survej_long %>% 
  mutate(question = case_when(
    question == 'saf' ~ 'safety and respect',
    question == 'com' ~ 'communication',
    question == 'eng' ~ 'engagement',
    question == 'aca' ~ 'academic expectations',
    TRUE ~ 'NA'
  )) %>% 
  mutate(group= case_when(
    group == '_p_' ~ 'parent',
    group == '_t_' ~ 'teacher',
    group == '_s_' ~ 'student',
    group == '_to' ~ 'total',
    TRUE ~ 'NA'
  ))

Make a boxplot to see if there appears to be differences in how the three groups (parents, students, and teachers) answered the four questions.

combined_survej_long %>% 
  filter(group!='total') %>% 
  ggplot(aes(x = group, y = score, color=question))+
  geom_boxplot()
## Warning: Removed 1268 rows containing non-finite values (stat_boxplot).

combined_survej_long %>% 
  filter(group!='total') %>% 
  ggplot(aes(x = question, y = score, color=group))+
  geom_boxplot()
## Warning: Removed 1268 rows containing non-finite values (stat_boxplot).