Purpose

I have been coding in R for the past three years. I want to do a historical analysis of my R programs and come up with some basic summary statistics.

Data Preparation

> # Root folder that is scanned recursively for R programs.
> temp.dir <- "C:/Cauldron/garage/garbage/Volatility"
> temp.files <- data.frame(files = list.files(temp.dir, recursive = TRUE,
     full.names = TRUE), stringsAsFactors = FALSE)
> # Keep only paths ending in a literal ".R". The dot is escaped so that
> # names which merely end in the letter R (e.g. "fooR") are not selected.
> condition <- grepl("\\.R$", temp.files$files)
> temp.files <- temp.files[condition, , drop = FALSE]
> # Number of lines in each program. readLines() opens and closes the file
> # itself when given a path, so no separate connection is needed.
> temp.files$size <- vapply(temp.files$files, function(path) length(readLines(path)),
     numeric(1), USE.NAMES = FALSE)
> # Largest programs first.
> order.idx <- order(temp.files$size, decreasing = TRUE)
> temp.files <- temp.files[order.idx, ]
> r.program.count <- nrow(temp.files)
> # Line counts of the programs that are shorter than 500 lines.
> interested <- temp.files$size[temp.files$size < 500]
> r.interested.pg.count <- length(interested)

There are about 1704 R files in the folder, of which 1690 are the relevant ones, each containing fewer than 500 lines of code.

Distribution of lines of code

> # Deciles (0%, 10%, ..., 100%) of the per-program line counts.
> quantile(x = interested, probs = seq(from = 0, to = 1, by = 0.1))
   0%   10%   20%   30%   40%   50%   60%   70%   80%   90%  100%
  1.0  14.0  26.0  39.0  56.0  75.0  97.0 133.0 184.2 247.1 489.0

The above shows that the median length of my programs is 75 lines of code.

Here is a basic plot of lines of code

> # Two panels side by side: histogram on the left, boxplot on the right.
> par(mfrow = c(1, 2))
> hist(x = interested, col = "blue", xlab = "lines", main = "Lines of code")
> boxplot(x = interested, col = "blue", main = "Lines of code")

My_R_Programs_Blog-003.jpg

The other thing I am interested in is vintage analysis.

> library(xts)
> # "output.txt" is read as a listing with one file per line, whose first
> # whitespace-separated field is an MM/DD/YYYY date and whose last field is
> # the file name (presumably a saved directory listing -- confirm).
> z <- readLines("output.txt")
> # Keep only lines that start with a two-digit month followed by "/".
> # The slash sits outside the group so it is required for every month.
> z <- z[grepl("^(0[1-9]|1[012])/", z)]
> # Keep only entries ending in a literal ".R" (the dot is escaped).
> z <- z[grepl("\\.R$", z)]
> z1 <- strsplit(z, " ")
> # Drop the empty strings produced by runs of spaces and keep the first
> # three fields of each line; only the first (the date) is used below.
> remove.spaces <- function(x) {
     x <- x[x != ""]
     x[1:3]
 }
> z2 <- lapply(z1, remove.spaces)
> z3 <- do.call(rbind, z2)
> z4 <- as.data.frame(z3)
> z4$date.touch <- as.Date(as.character(z4[, 1]), format = "%m/%d/%Y")
> z4$count <- 1
> # Number of files per date; the names of z5 are ISO-formatted dates.
> z5 <- tapply(z4$count, z4$date.touch, sum)
> result <- data.frame(dt2 = as.Date(names(z5), format = "%Y-%m-%d"),
     count = as.vector(z5))
> result <- result[order(result$dt2), ]
> result.xts <- xts(result$count, result$dt2)
> par(mfrow = c(1, 1))
> # Treat days with 20 or more files as outliers and drop them, then drop
> # the first two remaining observations (reason not recorded here).
> result.sanitized.xts <- Reclass(result.xts[result.xts[, 1] <
     20, ])[-c(1, 2), ]
> # Aggregate the daily counts to months, keyed by the first day of the month.
> temp <- data.frame(count = coredata(result.sanitized.xts), mon.yr = format(index(result.sanitized.xts),
     "%Y-%m-01"))
> temp <- cbind(tapply(temp$count, temp$mon.yr, sum))
> temp <- as.xts(temp)
> plot(temp, main = "Timeseries of R programs ", col = "blue",
     type = "h", lwd = 3)

My_R_Programs_Blog-004.jpg

Monthly Distribution

Given this dataset, what are the interesting questions that one can ask? First, what is the monthly distribution of the programs coded?

> temp <- data.frame(count = coredata(result.sanitized.xts), month = format(index(result.sanitized.xts),
     "%m"))
> # Total programs per calendar month, pooled across all years.
> monthly.count <- tapply(temp$count, temp$month, sum)
> monthly.count
 01  02  03  04  05  06  07  08  09  10  11  12
134  92  57  41  35 141  82 167 106 162 165  98
> months <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "July",
     "Aug", "Sept", "Oct", "Nov", "Dec")
> # Pick the label for each month that actually occurs in the data, so the
> # labels stay aligned with the counts even if some month has no programs.
> month.f <- factor(months[as.integer(names(monthly.count))], levels = months)
> plot(month.f, monthly.count, main = "Monthly distribution of programs coded",
     col = "blue", type = "h", lwd = 1)

My_R_Programs_Blog-007.jpg

Yearly Distribution

> temp <- data.frame(count = coredata(result.sanitized.xts), yr = format(index(result.sanitized.xts),
     "%Y"))
> # Total programs per calendar year.
> yearly.count <- tapply(temp$count, temp$yr, sum)
> yearly.count
2008 2009 2010 2011 2012
   6  302  432  492   48
> # Take the years from the data instead of hard-coding them, so the x
> # values always line up with the counts.
> yrs <- as.numeric(names(yearly.count))
> plot(yrs, yearly.count, main = "Yearly distribution of programs coded",
     type = "h", ylab = "count", col = "blue", lwd = 3)

My_R_Programs_Blog-009.jpg

With outlier days removed, I coded roughly 400 to 500 programs in each of 2010 and 2011, and another 300 in 2009.