# Clustering Examples

library(mlbench)
library(fpc)
library(cluster)
library(mclust)
library(dbscan)

###########################################################
# Simulate data
set.seed(665544)
n <- 600

# Simulate 10 centers with normal random error.
# runif(10, 0, 10) gives 10 center coordinates that are recycled
# across the n points (n is a multiple of 10).
x <- cbind(runif(10, 0, 10) + rnorm(n, sd = 0.2),
           runif(10, 0, 10) + rnorm(n, sd = 0.2))
plot(x)

x <- scale(x)
plot(x)

par(bg = "grey80")

# Use the mlbench package to simulate Gaussian clusters in 2d
library(mlbench)
set.seed(665544)

# 2 classes
p <- mlbench.2dnormals(n = 500, cl = 2, sd = 0.25)
plot(p)

# 10 classes
p <- mlbench.2dnormals(n = 500, cl = 10, sd = 0.25)
plot(p)

# x <- p$x   # Uncomment to try the mlbench data instead.

###########################################################
# k-means
km <- kmeans(x, centers = 10, nstart = 20)
plot(x, col = km$cluster)
points(km$centers, pch = 3, cex = 2)      # add the centroids
text(km$centers, labels = 1:10, pos = 2)  # add the cluster IDs

clusplot(x, km$cluster)

km$centers

def.par <- par(no.readonly = TRUE)  # save defaults, for resetting...
layout(t(1:10))                     # 10 plots in one row
for (i in 1:10)
  barplot(km$centers[i, ], ylim = c(-2, 2), main = paste("Cluster", i))

###########################################################
# Mclust (model-based clustering)
m <- Mclust(x)
summary(m)
par(mfrow = c(1, 1))
plot(m, what = "classification")

m <- Mclust(x, G = 10)
summary(m)
par(mfrow = c(1, 1))
plot(m, what = "classification")

###########################################################
# DBSCAN
kNNdistplot(x, k = 3)
abline(h = .14, col = "red")  # the "knee" in the kNN distance plot suggests eps

ds <- dbscan(x, 0.14)
# Note: fpc::dbscan() has a showplot argument to watch how DBSCAN works;
# the dbscan package version used here does not.
ds
plot(x, col = ds$cluster + 1L)  # cluster 0 (noise) is plotted in black
clusplot(x, ds$cluster)

###########################################################
# Big Data Analytics
# Reference: http://artax.karlin.mff.cuni.cz/r-help/library/biganalytics/html/bigkmeans.html

# Simple example (with one processor):
library(biganalytics)

# Simulated data
x <- big.matrix(1000000, 3, init = 0, type = "double")
x[seq(1, 1000000, by = 2), ] <- rnorm(1500000)
x[seq(2, 1000000, by = 2), ] <- rnorm(1500000, 5, 1)

ans <- bigkmeans(x, 1)  # One cluster isn't always allowed, but is convenient.
ans$centers
ans$withinss
ans$size
apply(x, 2, mean)

ans <- bigkmeans(x, 2, nstart = 5)  # Sequential multiple starts.
class(ans)
names(ans)
ans$centers
ans$withinss
ans$size

# To use a parallel backend, try something like the following,
# assuming you have at least 4 cores available on this machine.
# Each processor does incur memory overhead for the storage of
# cluster memberships.
library(doSNOW)
cl <- makeCluster(4)
registerDoSNOW(cl)
ans <- bigkmeans(x, 2, nstart = 5)

# Both of the following are run iteratively, but bigkmeans() uses less
# memory overhead. Note that the gc() comparisons aren't completely
# fair, because the big.matrix objects aren't reflected in the gc()
# summary. But the savings are there.
gc(reset = TRUE)
time.new <- system.time(print(bigkmeans(x, 2, nstart = 5)$centers))
gc()
y <- x[, ]
rm(x)
gc(reset = TRUE)
time.old <- system.time(print(kmeans(y, 2, nstart = 5)$centers))
gc()

# The bigkmeans() centers should match the kmeans() centers, without
# the memory overhead and running more quickly.
time.new
time.old

stopCluster(cl)
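
###########################################################
# Optional: choosing k with the average silhouette width
# A minimal sketch, not part of the examples above. It re-simulates the
# 10-center data (x was removed in the big-data section) and uses
# cluster::silhouette(), which is already loaded. The range of k tried
# here (2 to 15) is an arbitrary choice for illustration.
set.seed(665544)
n <- 600
x <- cbind(runif(10, 0, 10) + rnorm(n, sd = 0.2),
           runif(10, 0, 10) + rnorm(n, sd = 0.2))
x <- scale(x)
d <- dist(x)

# Average silhouette width of a k-means solution for each candidate k
avg.sil <- sapply(2:15, function(k) {
  km <- kmeans(x, centers = k, nstart = 20)
  mean(silhouette(km$cluster, d)[, "sil_width"])
})
plot(2:15, avg.sil, type = "b", xlab = "k", ylab = "avg. silhouette width")
# The k with the largest average silhouette width is a reasonable choice;
# for this simulation it should be close to the 10 true centers.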