Deliberate Practice Metrics - March 2012 Report Card
Purpose
I have been coding in R for the past three years. I want to do a historical analysis of my R programs and come up with some basic summary statistics.
Data Preparation
# Count the lines of code in every R script found under a directory.
#
# dir: directory that is searched recursively.
# Returns a data frame with one row per script -- `files` (full path) and
# `size` (number of lines) -- ordered from the longest script to the shortest.
count.r.lines <- function(dir) {
  # The dot is escaped so that only a real ".R" extension matches; the earlier
  # pattern "(.*?).(R)$" accepted any path that merely ended in "R".
  files <- list.files(dir, pattern = "\\.R$", recursive = TRUE,
                      full.names = TRUE)
  # readLines() opens and closes each file itself, so no separate connection
  # is needed. vapply() also copes with an empty file list.
  size <- vapply(files, function(f) length(readLines(f)), integer(1),
                 USE.NAMES = FALSE)
  listing <- data.frame(files = files, size = size, stringsAsFactors = FALSE)
  listing[order(listing$size, decreasing = TRUE), , drop = FALSE]
}

temp.dir <- "C:/Cauldron/garage/garbage/Volatility"
temp.files <- count.r.lines(temp.dir)
r.program.count <- nrow(temp.files)

# Only scripts shorter than 500 lines are analysed further.
interested <- temp.files$size[temp.files$size < 500]
r.interested.pg.count <- length(interested)
There are about 1704 R files in the folder, of which 1690 are the relevant ones, each containing fewer than 500 lines of code.
Distribution of lines of code
# Deciles (0%, 10%, ..., 100%) of the lines-of-code counts
quantile(x = interested, probs = seq(from = 0, to = 1, by = 0.1))
#>    0%   10%   20%   30%   40%   50%   60%   70%   80%   90%  100%
#>   1.0  14.0  26.0  39.0  56.0  75.0  97.0 133.0 184.2 247.1 489.0
The above shows that the median length of my programs is 75 lines of code.
Here is a basic plot of lines of code
# Histogram and boxplot of the lines-of-code counts, side by side
par(mfrow = c(1, 2))
loc.title <- "Lines of code"
loc.fill <- "blue"
hist(interested, main = loc.title, xlab = "lines", col = loc.fill)
boxplot(interested, main = loc.title, col = loc.fill)
The other thing I am interested in is vintage analysis.
library(xts)

# Raw listing, one entry per line. The code below assumes each file entry
# starts with a date in mm/dd/yyyy form and ends with the file name
# (presumably a saved directory listing -- confirm how output.txt is produced).
z <- readLines("output.txt")

# Keep only entries that start with a two-digit month followed by "/".
# The slash sits outside the group: the earlier pattern "^(0[1-9]|1[012]/)"
# required it only for months 10-12.
z <- z[grepl("^(0[1-9]|1[012])/", z)]

# Keep only entries for files with a real ".R" extension. The dot is escaped;
# the earlier pattern "(.*?).(R)$" matched any entry that merely ended in "R".
z <- z[grepl("\\.R$", z)]

if (length(z) == 0) {
  stop("no R file entries found in output.txt", call. = FALSE)
}

# Drop the empty strings produced by runs of spaces and keep the first three
# fields of an entry.
remove.spaces <- function(x) {
  x <- x[x != ""]
  x[1:3]
}

z1 <- strsplit(z, " ")
z2 <- lapply(z1, remove.spaces)
z3 <- do.call(rbind, z2)
z4 <- as.data.frame(z3)
z4$date.touch <- as.Date(as.character(z4[, 1]), format = "%m/%d/%Y")
z4$count <- 1

# Number of programs per day
z5 <- as.data.frame(tapply(z4$count, z4$date.touch, sum))
result <- data.frame(dt2 = as.Date(rownames(z5), format = "%Y-%m-%d"),
                     count = z5[, 1])
rownames(result) <- NULL
result <- result[order(result[, 1]), ]
result.xts <- xts(result[, 2], result[, 1])

par(mfrow = c(1, 1))

# Keep only days with fewer than 20 programs (larger days are treated as
# outliers), then drop the first two remaining rows.
result.sanitized.xts <- Reclass(result.xts[result.xts[, 1] < 20, ])[-c(1, 2), ]

# Aggregate the daily counts to months, keyed by the first day of each month
temp <- data.frame(count = coredata(result.sanitized.xts),
                   mon.yr = format(index(result.sanitized.xts), "%Y-%m-01"))
temp <- cbind(tapply(temp$count, temp$mon.yr, sum))
temp <- as.xts(temp)
plot(temp, main = "Timeseries of R programs ", col = "blue", type = "h",
     lwd = 3)
Monthly Distribution
Given this dataset, what are the interesting questions one can ask? For example: what is the monthly distribution of programs coded?
# Tag every day of the sanitized series with its two-digit month, then total
# the daily counts per month.
temp <- data.frame(
  count = coredata(result.sanitized.xts),
  month = format(index(result.sanitized.xts), "%m")
)
with(temp, tapply(count, month, sum))
#>  01  02  03  04  05  06  07  08  09  10  11  12
#> 134  92  57  41  35 141  82 167 106 162 165  98
# Plot the number of programs coded in each calendar month.
months <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "July", "Aug", "Sept",
            "Oct", "Nov", "Dec")
month.f <- factor(months, levels = months)

# Tabulate against all twelve month codes so that a month with no programs
# still gets an entry (as 0); otherwise the counts would no longer line up
# with the twelve labels in `month.f`.
month.codes <- factor(temp$month, levels = sprintf("%02d", 1:12))
monthly.counts <- tapply(temp$count, month.codes, sum)
monthly.counts[is.na(monthly.counts)] <- 0

plot(month.f, monthly.counts,
     main = "Monthly distribution of programs coded",
     col = "blue", type = "h", lwd = 1)
Yearly Distribution
# Tag every day of the sanitized series with its four-digit year, then total
# the daily counts per year.
temp <- data.frame(
  count = coredata(result.sanitized.xts),
  yr = format(index(result.sanitized.xts), "%Y")
)
with(temp, tapply(count, yr, sum))
#> 2008 2009 2010 2011 2012
#>    6  302  432  492   48
# Plot the number of programs coded per year.
yearly.counts <- tapply(temp$count, temp$yr, sum)

# The years come from the data rather than a hard-coded vector, so the x
# values always match the totals being plotted.
yrs <- as.numeric(names(yearly.counts))

# The stray empty argument from the earlier call (", ,") has been removed.
plot(yrs, yearly.counts,
     main = "Yearly distribution of programs coded",
     type = "h", ylab = "count", col = "blue", lwd = 3)
With the outlier days excluded, I coded roughly 430 programs in 2010 and 490 in 2011, and another 300 in 2009.