This R Markdown script reproduces Figure 6 from the paper: Ginda, M., Richey, M. C., Cousino, M., & Börner, K. (2019). Visualizing learner engagement, performance, and trajectories to evaluate and optimize online course design. PLoS ONE, 14(5), e0215964.

The visualization documented here uses analytic results for the MITxPro course, Architecture of Complex Systems (MITProfessionalX+SysEngxB1+3T2016), Fall 2016, produced with the edX Learner and Course Analytics Pipeline. More information about the data used in this visualization is available at the Sample Data Index.

Environment Setup

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

#Clean environment
# NOTE(review): rm(list = ls()) removes every object in the calling workspace.
# It is kept here to preserve the script's existing behaviour, but it is not
# needed when the document is knitted in a fresh session.
rm(list = ls())
# A large positive scipen biases printed/formatted numbers towards fixed
# notation instead of scientific notation.
options(scipen = 90)

#Load required packages
# library() stops with an error when a package is not installed, whereas
# require() only warns and returns FALSE, letting the script fail later on.
library("RCurl")      #Loading data from web
library("grid")       #Visualizations base
library("reshape2")   #Reshape data package
library("colorspace") #ColorSpace color palette selection
library("ggplot2")    #GGplot 2 graphics library
library("GGally")

Loading Data

The data set loaded as students was created by the script edX-7-studentFeatureExtraction.R. The data represent the overall performance and interaction statistics for each active student in the course, based on their log activity for the full duration of the course.

#Load Sample Data Set D
# The CSV is downloaded as text with RCurl::getURL() and parsed by read.csv().
# Certificate verification is left at its default (enabled) so the HTTPS
# download is authenticated; the previous ssl.verifypeer = FALSE turned it off.
students <- read.csv(
  text = getURL("https://raw.githubusercontent.com/cns-iu/edx-learnertrajectorynetpipeline/master/data/dataD.csv"),
  header = TRUE
)
# Print the structure of the loaded data frame.
str(students)
## 'data.frame':    1565 obs. of  35 variables:
##  $ user_id            : logi  NA NA NA NA NA NA ...
##  $ grade              : num  0.94 0 0.94 0.83 0.97 0.52 0 1 0.94 0.96 ...
##  $ cert_status        : Factor w/ 2 levels "downloadable",..: 1 2 1 1 1 2 2 1 1 1 ...
##  $ gender             : logi  NA NA NA NA NA NA ...
##  $ yob                : logi  NA NA NA NA NA NA ...
##  $ loe                : logi  NA NA NA NA NA NA ...
##  $ sessions           : int  65 22 53 45 54 28 5 24 48 37 ...
##  $ days_unq           : int  38 12 29 31 28 14 4 18 36 31 ...
##  $ mods_unq           : int  284 141 291 255 291 150 86 289 289 274 ...
##  $ vid_mods           : int  46 23 47 47 47 26 12 47 48 37 ...
##  $ prb_mod            : int  130 64 139 102 139 65 40 139 139 136 ...
##  $ oa_mods            : int  5 1 5 5 5 2 1 4 5 5 ...
##  $ events             : int  4110 439 1200 747 954 680 254 900 832 985 ...
##  $ vid_events         : int  3379 138 497 273 339 315 73 302 285 280 ...
##  $ prb_events         : int  224 90 271 150 256 133 98 267 225 267 ...
##  $ oa_events          : int  43 5 47 21 47 13 3 41 47 50 ...
##  $ oa_peerAccessEvents: int  9 0 9 3 9 0 0 9 9 9 ...
##  $ oa_getPeerEvents   : int  18 0 20 6 20 0 0 19 21 26 ...
##  $ seqNextEvents      : int  66 37 67 64 61 33 23 58 64 64 ...
##  $ seqPrevEvents      : int  13 0 7 7 12 11 3 5 11 10 ...
##  $ seqGotoEvents      : int  16 5 18 23 22 14 0 24 0 3 ...
##  $ modAccessEvents    : int  54 31 54 52 54 32 20 53 54 53 ...
##  $ total_time         : num  3236 1074 2738 1369 2021 ...
##  $ vid_time           : num  1724 265 776 436 698 ...
##  $ prb_time           : num  187 62.7 220.1 96.6 163.1 ...
##  $ oa_time            : num  200.37 3.88 168.27 12.42 199.32 ...
##  $ oa_peerAccessTime  : num  5.517 0 47.25 0.467 23.9 ...
##  $ oa_getPeerTime     : num  107.62 0 100.72 5.43 165.03 ...
##  $ seqNextTime        : num  462 443 816 347 458 ...
##  $ seqPrevTime        : num  29 0 27.68 42.73 7.68 ...
##  $ seqGotoTime        : num  61.55 2.75 103.07 87.1 74.03 ...
##  $ modAccessTime      : num  443 223 356 344 342 ...
##  $ prb_attempts       : int  185 85 211 142 217 96 83 202 199 222 ...
##  $ prb_correct        : int  119 62 122 93 150 58 72 126 121 120 ...
##  $ prb_totalPoints    : int  128 67 133 97 159 63 74 140 131 132 ...

Visualization of Results

Multiplot Layout

The multiplot function allows multiple visualizations to be combined into a single image. The multiplot function for ggplot2 was taken from Winston Chang (2017), Cookbook for R, http://www.cookbook-r.com/.

#Layout
# Draw several plot objects on one page using a grid layout.
#
# Arguments:
#   ...      plot objects to draw
#   plotlist optional list of further plot objects, appended after `...`
#   file     not referenced in the body; kept so existing calls stay valid
#   cols     number of columns used when `layout` is NULL
#   layout   optional matrix; the cells equal to i give the position of plot i
#
# With a single plot the plot is printed directly. With several plots a new
# grid page is started and each plot is printed into its viewport. With no
# plots at all nothing is drawn and NULL is returned invisibly.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots <- length(plots)

  # Nothing to draw: without this guard the loop below would run over
  # 1:0 (i.e. 1, 0) and fail on plots[[1]].
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    numRows <- ceiling(numPlots / cols)
    layout <- matrix(seq_len(cols * numRows), ncol = cols, nrow = numRows)
  }

  if (numPlots == 1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in seq_len(numPlots)) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

Visualization themes and parameters

Before visualizing the results, the ggplot2 package theme is set. A categorical color palette is generated using the colorspace package. Labels for the student groups are defined based on certificate status.

Themes and color Palette

#Theme for ggplot2
# Make theme_light() the session-wide default theme for subsequent plots.
theme_set(theme_light())

##Color Scales and setting for graphs
# Single grey colour stored as a length-one character vector.
strip <- "#DCDCDC" #Grey

#Pink/Orange/Yellow
# Build `n` hex colour strings by interpolating hue, chroma and luminance
# between two endpoints in polar LUV space.
#
# Arguments:
#   n      number of colours; values below 1 give character(0)
#   h      hue endpoints (recycled to length 2)
#   c.     chroma endpoints (recycled to length 2)
#   l      luminance endpoints (recycled to length 2)
#   power  exponents applied to the interpolation position: power[1] for
#          chroma, power[2] for luminance (recycled to length 2)
#   fixup  passed on to hex()
#   gamma  deprecated; a warning is raised when it is not NULL
#   alpha  opacity; only when supplied explicitly is it clamped to [0, 1] and
#          appended to every colour as a two-digit upper-case hex suffix
#   ...    further arguments passed on to hex()
pal2 <- function (n, h = c(-83, 43), c. = c(100, 43), l = c(56, 86), 
                  power = c(0.166666666666667, 0.9), fixup = TRUE, gamma = NULL, 
                  alpha = 1, ...) 
{
  if (!is.null(gamma)) {
    warning("'gamma' is deprecated and has no effect")
  }
  if (n < 1L) {
    return(character(0L))
  }

  # Recycle every range specification to exactly two endpoints.
  hueRange <- rep(h, length.out = 2L)
  chromaRange <- rep(c., length.out = 2L)
  lumRange <- rep(l, length.out = 2L)
  expo <- rep(power, length.out = 2L)

  # Interpolation positions running from 1 down to 0.
  pos <- seq(1, 0, length.out = n)

  lum <- lumRange[2L] - diff(lumRange) * pos^expo[2L]
  chroma <- chromaRange[2L] - diff(chromaRange) * pos^expo[1L]
  hue <- hueRange[2L] - diff(hueRange) * pos
  cols <- hex(polarLUV(L = lum, C = chroma, H = hue), fixup = fixup, ...)

  # The alpha suffix is added only when the caller passed `alpha` explicitly.
  if (missing(alpha)) {
    return(cols)
  }
  opacity <- pmax(pmin(alpha, 1), 0)
  suffix <- format(as.hexmode(round(opacity * 255 + 1e-04)),
                   width = 2L, upper.case = TRUE)
  paste0(cols, suffix)
}

#Color Scales for Student Certification Cohort Assignments
# Three colours generated by pal2(); the plots below use the first two.
stdColors <- pal2(n = 3)

Figure Labels

##Label groups
#Certification Groups
# Group sizes are computed once and shared by both labellers.
# sum(cond, na.rm = TRUE) counts the rows that meet the condition; a row with a
# missing grade is counted in neither group (row-subsetting with an NA
# condition would add an NA row to both groups).
nStuAll    <- nrow(students)
nStuCert   <- sum(students$grade >= .7, na.rm = TRUE)
nStuNoCert <- sum(students$grade < .7, na.rm = TRUE)

# Long, single-line facet labels.
userGrps <- as_labeller(c('unq_stu_per' = paste0("All students (", nStuAll, " students)"),
                          'unq_stu_per.1' = paste0("Certificate granted, grades between 100%-70% (",
                                                   nStuCert, " students)"),
                          'unq_stu_per.2' = paste0("No certification but active, with grades less than 70% (",
                                                   nStuNoCert, " students)")))
# Short facet labels with the student count on a second line.
userGrps2 <- as_labeller(c('unq_stu_per' = paste0("All students\n(", nStuAll, " students)"),
                           'unq_stu_per.1' = paste0("Certificate granted\n(",
                                                    nStuCert, " students)"),
                           'unq_stu_per.2' = paste0("No certification active\n(",
                                                    nStuNoCert, " students)")))

Figure 6

6A Student Grades and Number of Events

#6A Grades vs Events (Unq.)
# Cubic model of final grade on event count; it is used only for the
# annotation text. raw = TRUE fits plain powers of x, so coef(m) holds the
# intercept followed by the coefficients of x, x^2 and x^3. With the default
# orthogonal basis of poly() the coefficients cannot be written next to
# x, x^2, x^3. The fitted values, and therefore r^2, are the same either way.
# NOTE(review): the model regresses grade on events, while the panel maps
# grade to x and events to y, so geom_smooth() draws events ~ poly(grade, 3).
# The annotation describes `m`, not the drawn curve - confirm which is intended.
# NOTE(review): `m` uses every row with grade > 0; the plot also drops rows
# whose event count is more than 5 sd from the mean.
m <- glm(grade ~ poly(events, 3, raw = TRUE), data = students[students$grade > 0, ])
# unname(): a named value embedded in the expression would be deparsed as
# c(name = "...") in the label text.
cf <- unname(coef(m))
# The lower-order terms carry an explicit sign (flag = "+") and are juxtaposed
# with `*`, so the label reads b.x^3 +/- c.x^2 +/- d.x +/- a. formatC() with
# format = "g" keeps very small coefficients compact regardless of scipen.
eq <- substitute(italic(y) == b %.% italic(x)^3 * c %.% italic(x)^2 * d %.% italic(x) * a * ","~~italic(r)^2~"="~r2,
                 list(a = formatC(cf[1], digits = 3, format = "g", flag = "+"),  # intercept
                      b = formatC(cf[4], digits = 3, format = "g"),              # x^3
                      c = formatC(cf[3], digits = 3, format = "g", flag = "+"),  # x^2
                      d = formatC(cf[2], digits = 3, format = "g", flag = "+"),  # x
                      r2 = format(1 - (m$deviance / m$null.deviance), digits = 3)))
p4 <- ggplot(students[students$grade > 0 & abs(scale(students$events)) <= 5, ], aes(x = grade, y = events)) +
        geom_point(aes(color = cert_status)) +
        geom_smooth(method = "glm", formula = y ~ poly(x, 3), fullrange = FALSE) +
        geom_text(x = .43, y = 2264, aes(label = eq), data = data.frame(eq = as.character(as.expression(eq))), parse = TRUE) +
        scale_colour_manual(values = stdColors[1:2]) +
        labs(x = "Final Grade", y = "Events (within 5 sd. of mean)") +
        theme(legend.position = "none")
p4

6B Student Grades and Number of Unique Modules Accessed

#6B Grades vs Mods Accessed (Unq.)
# Cubic model of final grade on the number of unique modules accessed; it is
# used only for the annotation text. raw = TRUE fits plain powers of x, so
# coef(m) holds the intercept followed by the coefficients of x, x^2 and x^3.
# With the default orthogonal basis of poly() the coefficients cannot be
# written next to x, x^2, x^3. The fitted values, and therefore r^2, are the
# same either way.
# NOTE(review): the model regresses grade on mods_unq, while the panel maps
# grade to x and mods_unq to y, so geom_smooth() draws mods_unq ~ poly(grade, 3).
# The annotation describes `m`, not the drawn curve - confirm which is intended.
m <- glm(grade ~ poly(mods_unq, 3, raw = TRUE), data = students[students$grade > 0, ])
# unname(): a named value embedded in the expression would be deparsed as
# c(name = "...") in the label text.
cf <- unname(coef(m))
# The lower-order terms carry an explicit sign (flag = "+") and are juxtaposed
# with `*`, so the label reads b.x^3 +/- c.x^2 +/- d.x +/- a. formatC() with
# format = "g" keeps very small coefficients compact regardless of scipen.
eq <- substitute(italic(y) == b %.% italic(x)^3 * c %.% italic(x)^2 * d %.% italic(x) * a * ","~~italic(r)^2~"="~r2,
                 list(a = formatC(cf[1], digits = 3, format = "g", flag = "+"),  # intercept
                      b = formatC(cf[4], digits = 3, format = "g"),              # x^3
                      c = formatC(cf[3], digits = 3, format = "g", flag = "+"),  # x^2
                      d = formatC(cf[2], digits = 3, format = "g", flag = "+"),  # x
                      r2 = format(1 - (m$deviance / m$null.deviance), digits = 3)))

p3 <- ggplot(students[students$grade > 0, ], aes(x = grade, y = mods_unq)) +
        geom_point(aes(color = cert_status)) +
        geom_smooth(method = "glm", formula = y ~ poly(x, 3), fullrange = FALSE) +
        geom_text(x = .46, y = 258, aes(label = eq), data = data.frame(eq = as.character(as.expression(eq))), parse = TRUE) +
        scale_colour_manual(values = stdColors[1:2]) +
        labs(x = "Final Grade", y = "Modules Accessed (Unq.)") +
        theme(legend.position = "none")
p3

6C Student Grades and Number of Open Assessment Events

#6C Grades vs Open Assessment Events
# Cubic model of final grade on the number of open assessment events; it is
# used only for the annotation text. raw = TRUE fits plain powers of x, so
# coef(m) holds the intercept followed by the coefficients of x, x^2 and x^3.
# With the default orthogonal basis of poly() the coefficients cannot be
# written next to x, x^2, x^3. The fitted values, and therefore r^2, are the
# same either way.
# NOTE(review): the model regresses grade on oa_events, while the panel maps
# grade to x and oa_events to y, so geom_smooth() draws oa_events ~ poly(grade, 3).
# The annotation describes `m`, not the drawn curve - confirm which is intended.
# NOTE(review): `m` uses every row with grade > 0; the plot also drops rows
# whose oa_events count is more than 2 sd from the mean.
m <- glm(grade ~ poly(oa_events, 3, raw = TRUE), data = students[students$grade > 0, ])
# unname(): a named value embedded in the expression would be deparsed as
# c(name = "...") in the label text.
cf <- unname(coef(m))
# The lower-order terms carry an explicit sign (flag = "+") and are juxtaposed
# with `*`, so the label reads b.x^3 +/- c.x^2 +/- d.x +/- a. formatC() with
# format = "g" keeps very small coefficients compact regardless of scipen.
eq <- substitute(italic(y) == b %.% italic(x)^3 * c %.% italic(x)^2 * d %.% italic(x) * a * ","~~italic(r)^2~"="~r2,
                 list(a = formatC(cf[1], digits = 3, format = "g", flag = "+"),  # intercept
                      b = formatC(cf[4], digits = 3, format = "g"),              # x^3
                      c = formatC(cf[3], digits = 3, format = "g", flag = "+"),  # x^2
                      d = formatC(cf[2], digits = 3, format = "g", flag = "+"),  # x
                      r2 = format(1 - (m$deviance / m$null.deviance), digits = 3)))
p5 <- ggplot(students[students$grade > 0 & abs(scale(students$oa_events)) <= 2, ], aes(x = grade, y = oa_events)) +
        geom_point(aes(color = cert_status)) +
        geom_smooth(method = "glm", formula = y ~ poly(x, 3), fullrange = FALSE) +
        geom_text(x = .43, y = 61.5, aes(label = eq), data = data.frame(eq = as.character(as.expression(eq))), parse = TRUE) +
        scale_colour_manual(values = stdColors[1:2]) +
        labs(x = "Final Grade", y = "Open Assessment Events (within 2 sd. of mean)") +
        theme(legend.position = "none")
p5

6D Student Events and Number of Sessions

#6D Events vs Sessions
# Cubic model of sessions on event count, fitted on the same rows that are
# plotted (grade > 0 and events within 5 sd of the mean); it is used for the
# annotation text. raw = TRUE fits plain powers of x, so coef(m) holds the
# intercept followed by the coefficients of x, x^2 and x^3. With the default
# orthogonal basis of poly() the coefficients cannot be written next to
# x, x^2, x^3. The fitted values, and therefore r^2, are the same either way.
m <- glm(sessions ~ poly(events, 3, raw = TRUE),
         data = students[students$grade > 0 & abs(scale(students$events)) <= 5, ])
# unname(): a named value embedded in the expression would be deparsed as
# c(name = "...") in the label text.
cf <- unname(coef(m))
# The lower-order terms carry an explicit sign (flag = "+") and are juxtaposed
# with `*`, so the label reads b.x^3 +/- c.x^2 +/- d.x +/- a. formatC() with
# format = "g" keeps very small coefficients compact regardless of scipen.
eq <- substitute(italic(y) == b %.% italic(x)^3 * c %.% italic(x)^2 * d %.% italic(x) * a * ","~~italic(r)^2~"="~r2,
                 list(a = formatC(cf[1], digits = 3, format = "g", flag = "+"),  # intercept
                      b = formatC(cf[4], digits = 3, format = "g"),              # x^3
                      c = formatC(cf[3], digits = 3, format = "g", flag = "+"),  # x^2
                      d = formatC(cf[2], digits = 3, format = "g", flag = "+"),  # x
                      r2 = format(1 - (m$deviance / m$null.deviance), digits = 3)))
p7 <- ggplot(students[students$grade > 0 & abs(scale(students$events)) <= 5, ], aes(x = events, y = sessions)) +
        geom_point(aes(color = cert_status)) +
        geom_smooth(method = "glm", formula = y ~ poly(x, 3), fullrange = FALSE) +
        geom_text(x = 1150, y = 145, aes(label = eq), data = data.frame(eq = as.character(as.expression(eq))), parse = TRUE) +
        scale_colour_manual(values = stdColors[1:2]) +
        labs(x = "Events", y = "Sessions") +
        theme(legend.position = "none")
p7

6E Number of Student Events and Number of Problem Attempts

#6E Events vs Problem Attempts
# Scatter of problem attempts against events, coloured by certificate status,
# for students with a grade above 0 whose event count lies within 4 sd of the
# mean.
# NOTE(review): the other event-based panels in this script use a 5 sd cut-off;
# confirm that 4 is intended here.
p10 <- ggplot(data = students[with(students, grade > 0 & abs(scale(events)) <= 4), ],
              mapping = aes(x = events, y = prb_attempts)) +
  geom_point(mapping = aes(colour = cert_status)) +
  scale_color_manual(values = stdColors[1:2]) +
  xlab("Events") +
  ylab("Problem Attempts") +
  theme(legend.position = "none")
p10

6F Number of Student Events and Number of Open Assessment Events

#6F Events vs OA Events
# Scatter of open assessment events against events, coloured by certificate
# status, for students with a grade above 0 whose event count lies within
# 5 sd of the mean.
p11 <- ggplot(data = students[with(students, grade > 0 & abs(scale(events)) <= 5), ],
              mapping = aes(x = events, y = oa_events)) +
        geom_point(mapping = aes(colour = cert_status)) +
        scale_color_manual(values = stdColors[1:2]) +
        xlab("Events") +
        ylab("Open Assessment Events") +
        theme(legend.position = "none")
p11

Composite of Fig 6

# Give each panel its letter as a left-aligned title, then draw all six panels
# in a two-column layout. Panels are passed in the order A-F.
multiplot(plotlist = Map(function(panel, tag) {
                           panel + ggtitle(tag) + theme(plot.title = element_text(hjust = 0))
                         },
                         list(p4, p3, p5, p7, p10, p11),
                         c("A", "B", "C", "D", "E", "F")),
          cols = 2)