This R Markdown script reproduces Figure 6 from the paper, Ginda, M., Richey, M. C., Cousino, M., & Börner, K. (2019). Visualizing learner engagement, performance, and trajectories to evaluate and optimize online course design. PloS one, 14(5), e0215964.
The visualization documented here uses analytic results for the MITxPro course, Architecture of Complex Systems (MITProfessionalX+SysEngxB1+3T2016), Fall 2016, produced with the edX Learner and Course Analytics Pipeline. More information about the data used in this visualization is available at the Sample Data Index.
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
#Clean environment
# Note: rm() only removes objects from the global workspace; packages that are
# already attached and options set by earlier code are left untouched.
rm(list = ls())
# A large positive scipen makes R prefer fixed over exponential notation when
# numbers are printed or formatted.
options(scipen = 90)
#Load required packages
# library() stops immediately when a package is missing. require() would only
# warn and return FALSE, so the script would fail later at the first call into
# the missing package instead of here.
library("RCurl") #Loading data from web
library("grid") #Visualizations base
library("reshape2") #Reshape data package
library("colorspace") #ColorSpace color pallete selection
library("ggplot2") #GGplot 2 graphics library
library("GGally")
The data set loaded as students was created by the script edX-7-studentFeatureExtraction.R. Data represents the overall performance and interaction statistics for each active student in the course, based on their log activity for the full duration of the course.
#Load Sample Data Set D
# Read the CSV straight from the https URL with base R. The previous version
# downloaded it with ssl.verifypeer = FALSE, which switches off TLS certificate
# verification and would accept a tampered or spoofed response.
# stringsAsFactors = TRUE keeps cert_status a factor on R >= 4.0, matching the
# str() output documented below.
students_url <- "https://raw.githubusercontent.com/cns-iu/edx-learnertrajectorynetpipeline/master/data/dataD.csv"
students <- read.csv(students_url, header = TRUE, stringsAsFactors = TRUE)
str(students)
## 'data.frame': 1565 obs. of 35 variables:
## $ user_id : logi NA NA NA NA NA NA ...
## $ grade : num 0.94 0 0.94 0.83 0.97 0.52 0 1 0.94 0.96 ...
## $ cert_status : Factor w/ 2 levels "downloadable",..: 1 2 1 1 1 2 2 1 1 1 ...
## $ gender : logi NA NA NA NA NA NA ...
## $ yob : logi NA NA NA NA NA NA ...
## $ loe : logi NA NA NA NA NA NA ...
## $ sessions : int 65 22 53 45 54 28 5 24 48 37 ...
## $ days_unq : int 38 12 29 31 28 14 4 18 36 31 ...
## $ mods_unq : int 284 141 291 255 291 150 86 289 289 274 ...
## $ vid_mods : int 46 23 47 47 47 26 12 47 48 37 ...
## $ prb_mod : int 130 64 139 102 139 65 40 139 139 136 ...
## $ oa_mods : int 5 1 5 5 5 2 1 4 5 5 ...
## $ events : int 4110 439 1200 747 954 680 254 900 832 985 ...
## $ vid_events : int 3379 138 497 273 339 315 73 302 285 280 ...
## $ prb_events : int 224 90 271 150 256 133 98 267 225 267 ...
## $ oa_events : int 43 5 47 21 47 13 3 41 47 50 ...
## $ oa_peerAccessEvents: int 9 0 9 3 9 0 0 9 9 9 ...
## $ oa_getPeerEvents : int 18 0 20 6 20 0 0 19 21 26 ...
## $ seqNextEvents : int 66 37 67 64 61 33 23 58 64 64 ...
## $ seqPrevEvents : int 13 0 7 7 12 11 3 5 11 10 ...
## $ seqGotoEvents : int 16 5 18 23 22 14 0 24 0 3 ...
## $ modAccessEvents : int 54 31 54 52 54 32 20 53 54 53 ...
## $ total_time : num 3236 1074 2738 1369 2021 ...
## $ vid_time : num 1724 265 776 436 698 ...
## $ prb_time : num 187 62.7 220.1 96.6 163.1 ...
## $ oa_time : num 200.37 3.88 168.27 12.42 199.32 ...
## $ oa_peerAccessTime : num 5.517 0 47.25 0.467 23.9 ...
## $ oa_getPeerTime : num 107.62 0 100.72 5.43 165.03 ...
## $ seqNextTime : num 462 443 816 347 458 ...
## $ seqPrevTime : num 29 0 27.68 42.73 7.68 ...
## $ seqGotoTime : num 61.55 2.75 103.07 87.1 74.03 ...
## $ modAccessTime : num 443 223 356 344 342 ...
## $ prb_attempts : int 185 85 211 142 217 96 83 202 199 222 ...
## $ prb_correct : int 119 62 122 93 150 58 72 126 121 120 ...
## $ prb_totalPoints : int 128 67 133 97 159 63 74 140 131 132 ...
The multiplot function allows multiple visualizations to be combined into a single image. The multiplot function for ggplot2 was taken from Winston Chang (2017), Cookbook for R, http://www.cookbook-r.com/.
#Layout
# Draw several plots on one page using a grid layout.
#
# Arguments:
#   ...      : plot objects to draw (anything with a print method that accepts
#              a `vp` viewport argument, e.g. ggplot objects).
#   plotlist : optional list of further plot objects, appended after `...`.
#   file     : accepted for compatibility with the original signature; unused.
#   cols     : number of columns when `layout` is not given.
#   layout   : optional integer matrix; the cell(s) holding value i are where
#              plot i is drawn. When NULL, plots fill a matrix with `cols`
#              columns in column-major order.
#
# With no plots at all the function returns invisible NULL without touching
# the graphics device. A single plot is printed directly (layout is ignored).
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # Nothing to draw. Without this guard 1:numPlots would iterate over c(1, 0)
  # and fail on plots[[1]].
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # nrow: number of rows needed, calculated from the number of columns
    rows <- ceiling(numPlots / cols)
    layout <- matrix(seq_len(cols * rows), ncol = cols, nrow = rows)
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    # Make each plot, in the correct location
    for (i in seq_len(numPlots)) {
      # Get the row/column positions of the cells that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
Before visualizing the results, the ggplot2 package theme is set. A categorical color palette is generated using the colorspace package. Labels are defined for the student groups based on certificate status.
#Theme for ggplot2
# Make theme_light() the session-wide default theme for ggplot2 plots.
theme_set(theme_light())
##Color Scales and setting for graphs
# Single grey colour, stored as a length-one character vector.
strip <- "#DCDCDC" #Grey
#Pink/Orange/Yellow
# Generate `n` hex colours by interpolating hue, chroma and luminance in
# polar LUV space between the two end points given in `h`, `c.` and `l`.
#   n      : number of colours; values below 1 give character(0).
#   h, c., l : end points (recycled to length 2) for hue, chroma, luminance.
#   power  : exponents (recycled to length 2); power[1] shapes the chroma
#            ramp and power[2] the luminance ramp.
#   fixup, ... : passed on to hex().
#   gamma  : deprecated; a warning is raised when it is not NULL.
#   alpha  : opacity in [0, 1]; a two-digit hex suffix is appended only when
#            the caller supplies it explicitly.
pal2 <- function (n, h = c(-83, 43), c. = c(100, 43), l = c(56, 86),
                  power = c(0.166666666666667, 0.9), fixup = TRUE, gamma = NULL,
                  alpha = 1, ...)
{
  if (!is.null(gamma)) {
    warning("'gamma' is deprecated and has no effect")
  }
  if (n < 1L) {
    return(character(0L))
  }

  hue <- rep(h, length.out = 2L)
  chroma <- rep(c., length.out = 2L)
  lum <- rep(l, length.out = 2L)
  expo <- rep(power, length.out = 2L)

  # Interpolation position runs from 1 (first end point) down to 0 (second).
  pos <- seq(1, 0, length.out = n)
  lum_ramp <- lum[2L] - diff(lum) * pos^expo[2L]
  chroma_ramp <- chroma[2L] - diff(chroma) * pos^expo[1L]
  hue_ramp <- hue[2L] - diff(hue) * pos
  cols <- hex(polarLUV(L = lum_ramp, C = chroma_ramp, H = hue_ramp),
              fixup = fixup, ...)

  if (!missing(alpha)) {
    # Clamp to [0, 1], convert to a 0-255 value and append as upper-case hex.
    clamped <- pmax(pmin(alpha, 1), 0)
    suffix <- format(as.hexmode(round(clamped * 255 + 1e-04)),
                     width = 2L, upper.case = TRUE)
    cols <- paste0(cols, suffix)
  }
  cols
}
#Color Scales for Student Certification Cohort Assignments
stdColors <- pal2(3)
##Label groups
#Certification Groups
# Group sizes used in the facet labels: all rows, rows with grade >= 0.7 and
# rows with grade < 0.7. Computed once and shared by both labellers.
n_students_all <- length(students$user_id)
n_students_cert <- length(students[students$grade >= .7, ]$user_id)
n_students_nocert <- length(students[students$grade < .7, ]$user_id)

# Long-form labels (single line).
userGrps <- as_labeller(c(
  "unq_stu_per" = paste0("All students (", n_students_all, " students)"),
  "unq_stu_per.1" = paste0("Certificate granted, grades between 100%-70% (",
                           n_students_cert, " students)"),
  "unq_stu_per.2" = paste0("No certification but active, with grades less than 70% (",
                           n_students_nocert, " students)")
))

# Short-form labels, with the count on a second line.
userGrps2 <- as_labeller(c(
  "unq_stu_per" = paste0("All students\n(", n_students_all, " students)"),
  "unq_stu_per.1" = paste0("Certificate granted\n(",
                           n_students_cert, " students)"),
  "unq_stu_per.2" = paste0("No certification active\n(",
                           n_students_nocert, " students)")
))
# ---- Shared helpers for the Figure 6 panels ----

# Build the plotmath annotation for a cubic fit.
#   m : a glm whose only term is poly(<x>, 3, raw = TRUE), so that coef(m) is
#       (intercept, x, x^2, x^3) expressed in the units of the predictor.
# Returns a character string suitable for geom_text(parse = TRUE): the fitted
# equation with explicit signs, followed by r^2 = 1 - deviance / null deviance.
# The earlier version attached the degree-1 coefficient to x^3 (and the
# degree-3 coefficient to x), took the coefficients from an orthogonal
# polynomial basis (which are not coefficients of powers of x), and omitted
# the "+" between the cubic and quadratic terms.
cubic_label <- function(m) {
  beta <- unname(coef(m))
  stopifnot(length(beta) == 4L, !anyNA(beta))
  # Quote the formatted magnitude so plotmath prints it verbatim.
  num <- function(v) paste0("'", formatC(abs(v), digits = 3, format = "g"), "'")
  sgn <- function(v) if (v < 0) "-" else "+"
  r2 <- 1 - m$deviance / m$null.deviance
  paste0(
    "italic(y) == ", if (beta[4] < 0) "-" else "", num(beta[4]),
    " %.% italic(x)^3 ",
    sgn(beta[3]), " ", num(beta[3]), " %.% italic(x)^2 ",
    sgn(beta[2]), " ", num(beta[2]), " %.% italic(x) ",
    sgn(beta[1]), " ", num(beta[1]),
    " * ',' ~~ italic(r)^2 == '", formatC(r2, digits = 3, format = "g"), "'"
  )
}

# Scatter plot of `data` coloured by cert_status, without a legend.
#   mapping      : aes() giving the x and y variables.
#   x_lab, y_lab : axis titles.
cohort_scatter <- function(data, mapping, x_lab, y_lab) {
  ggplot(data, mapping) +
    geom_point(aes(color = cert_status)) +
    scale_colour_manual(values = stdColors[1:2]) +
    labs(x = x_lab, y = y_lab) +
    theme(legend.position = "none")
}

# Layers that add a cubic glm smoother and a parsed text annotation placed at
# (at_x, at_y) in data coordinates.
cubic_trend <- function(label, at_x, at_y) {
  list(
    geom_smooth(method = "glm", formula = y ~ poly(x, 3), fullrange = FALSE),
    geom_text(x = at_x, y = at_y, aes(label = eq),
              data = data.frame(eq = label), parse = TRUE)
  )
}

# Row filters shared by the panels. The z-scores are computed over every row
# of `students` (including grade == 0), as in the original expressions.
graded <- students$grade > 0
events_z <- abs(as.vector(scale(students$events)))
oa_events_z <- abs(as.vector(scale(students$oa_events)))

#6A Grades vs Events (Unq.)
# NOTE(review): in panels A-C the labelled model has grade as the response,
# while the panel (and its smoother) puts grade on the x axis. The model
# specification is kept as it was so the reported r^2 does not change; confirm
# which orientation the figure is meant to report.
m <- glm(grade ~ poly(events, 3, raw = TRUE), data = students[graded, ])
eq <- cubic_label(m)
p4 <- cohort_scatter(students[graded & events_z <= 5, ],
                     aes(x = grade, y = events),
                     "Final Grade", "Events (within 5 sd. of mean)") +
  cubic_trend(eq, .43, 2264)
p4

#6B Grades vs Mods Accessed (Unq.)
m <- glm(grade ~ poly(mods_unq, 3, raw = TRUE), data = students[graded, ])
eq <- cubic_label(m)
p3 <- cohort_scatter(students[graded, ],
                     aes(x = grade, y = mods_unq),
                     "Final Grade", "Modules Accessed (Unq.)") +
  cubic_trend(eq, .46, 258)
p3

#6C Grades vs Open Assessment Events
m <- glm(grade ~ poly(oa_events, 3, raw = TRUE), data = students[graded, ])
eq <- cubic_label(m)
p5 <- cohort_scatter(students[graded & oa_events_z <= 2, ],
                     aes(x = grade, y = oa_events),
                     "Final Grade",
                     "Open Assessment Events (within 2 sd. of mean)") +
  cubic_trend(eq, .43, 61.5)
p5

#6D Events vs Sessions
# Here the labelled model and the panel share orientation and rows.
m <- glm(sessions ~ poly(events, 3, raw = TRUE),
         data = students[graded & events_z <= 5, ])
eq <- cubic_label(m)
p7 <- cohort_scatter(students[graded & events_z <= 5, ],
                     aes(x = events, y = sessions),
                     "Events", "Sessions") +
  cubic_trend(eq, 1150, 145)
p7

#6E Events vs Problem Attempts
# NOTE(review): this panel keeps events within 4 sd of the mean while the
# other event panels use 5 sd; kept as is, confirm it is intentional.
p10 <- cohort_scatter(students[graded & events_z <= 4, ],
                      aes(x = events, y = prb_attempts),
                      "Events", "Problem Attempts")
p10

#6F Events vs OA Events
p11 <- cohort_scatter(students[graded & events_z <= 5, ],
                      aes(x = events, y = oa_events),
                      "Events", "Open Assessment Events")
p11
# Assemble Figure 6: tag each panel with its letter (left-aligned title) and
# draw all six in two columns. multiplot fills the layout column by column,
# so A-C form the left column and D-F the right.
fig6_panels <- list(A = p4, B = p3, C = p5, D = p7, E = p10, F = p11)
fig6_titled <- lapply(names(fig6_panels), function(tag) {
  fig6_panels[[tag]] + ggtitle(tag) + theme(plot.title = element_text(hjust = 0))
})
multiplot(plotlist = fig6_titled, cols = 2)