Import data
D75S <- read_tsv("masterfile11_d75_final.txt")
## Rows: 56 Columns: 1773
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): dbn, bn, schoolname, studentssurveyed, schooltype
## dbl (1739): d75, highschool, rr_s, rr_t, rr_p, N_s, N_t, N_p, nr_s, nr_t, nr...
## lgl (29): p_q5, p_q9, p_q13a, p_q13b, p_q13c, p_q13d, p_q14a, p_q14b, p_q1...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
generalS <- read_tsv("masterfile11_gened_final.txt")
## Rows: 1646 Columns: 1942
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): dbn, bn, schoolname, studentssurveyed, schooltype
## dbl (1904): d75, highschool, rr_s, rr_t, rr_p, N_s, N_t, N_p, nr_s, nr_t, nr...
## lgl (33): p_q1, p_q3d, p_q9, p_q10, p_q12aa, p_q12ab, p_q12ac, p_q12ad, p_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined <- read_csv("combined.csv")
## Rows: 479 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): DBN, school_name, boro
## dbl (27): Num of SAT Test Takers, SAT Critical Reading Avg. Score, SAT Math ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Simplify the survey dataframes to include only variables like dbn and variables that contain aggregate scores like saf_p_11 and com_s_11.
Filter the survey dataframes to include only observations for high schools.
d75_select <- D75S %>%
select(dbn:aca_tot_11)
general_select <- generalS %>%
select(dbn:aca_tot_11) %>%
filter(schooltype == 'High School')
Combine three dataframes. Use left_join()
to keep the
observations in the survey dataframe that correspond to observations in
combined.
survey_total <- d75_select %>%
bind_rows(general_select) %>%
rename(DBN = dbn)
combined_survej <- combined %>%
left_join(survey_total, by = 'DBN')
Correlation to show if student, teacher, and parent perceptions of NYC school quality appear to be related to demographic and academic success metrics.
cor_mat <- combined_survej %>%
select(avg_sat_score, contains('_11')) %>%
cor(use = 'pairwise.complete.obs')
cor_tib <- cor_mat %>%
as_tibble(rownames= 'variable')
Find strong correlations of other variables with
avg_sat_score
that are greater than 0.25 or less than -0.25
(strong correlations).
strong_cors <- cor_tib %>%
select(variable, avg_sat_score) %>%
filter(avg_sat_score > 0.25 | avg_sat_score < -0.25)
Create scatter plots to explore potentially interesting relationships between variables in greater detail.
strong_cors %>%
filter(variable != 'avg_sat_score') %>%
ggplot(aes(x = variable, y = avg_sat_score ))+
geom_point()+
labs(
title= 'Strong correlation between average SAT score and servey varibles',
x = 'Survey variables',
y = 'Average SAT score'
)
#Differences in Student, Parent, and Teacher Perceptions
reshape data to to long type.
combined_survej_long <- combined_survej %>%
pivot_longer(cols = saf_p_11:aca_tot_11,
names_to = 'survey_question',
values_to = 'score')
Use str_sub
to create new variables, group
and question
, from the survey_question
variable.
combined_survej_long <- combined_survej_long %>%
mutate(question = str_sub(survey_question,1,3)) %>%
mutate(group= str_sub(survey_question,4,6))
Use case_when
to rename values in group
and
question
.
combined_survej_long <- combined_survej_long %>%
mutate(question = case_when(
question == 'saf' ~ 'safety and respect',
question == 'com' ~ 'communication',
question == 'eng' ~ 'engagement',
question == 'aca' ~ 'academic expectations',
TRUE ~ 'NA'
)) %>%
mutate(group= case_when(
group == '_p_' ~ 'parent',
group == '_t_' ~ 'teacher',
group == '_s_' ~ 'student',
group == '_to' ~ 'total',
TRUE ~ 'NA'
))
Make a boxplot to see if there appears to be differences in how the three groups (parents, students, and teachers) answered the four questions.
combined_survej_long %>%
filter(group!='total') %>%
ggplot(aes(x = group, y = score, color=question))+
geom_boxplot()
## Warning: Removed 1268 rows containing non-finite values (stat_boxplot).
combined_survej_long %>%
filter(group!='total') %>%
ggplot(aes(x = question, y = score, color=group))+
geom_boxplot()
## Warning: Removed 1268 rows containing non-finite values (stat_boxplot).