cluster analysis - R warning - longer object length is not a multiple of shorter object length -
this question has answer here:
i'm trying perform clustering on spatial data based on distance constrain cluster size. found article online, (spatial clustering equal sizes), , works small list of date 3 clusters.
however, when tried run larger list , cluster them 30 clusters, doesn't work expected. clusters returns uneven again, below.
i tried smaller data 30 cluster , example dataset, both worked out evenly. guess it's wrong data. i'm not sure how fix it.
table( cl_constrain$cluster )
cluster 1 2 3 4 5 6 7 8 9 10
size 151 63 67 88 65 89 92 82 72 84cluster 11 12 13 14 15 16 17 18 19 20
size 60 61 44 46 60 51 65 216 56 188cluster 20 21 22 23 24 25 26 27 28 29 30
size 229 78 101 75 196 70 222 62 102 271
my data set looks this
i'm new r, , not sure what's going wrong it, me out please? lot!
here's source code article.
# convert radian as_radians = function(theta=0){ return(theta * pi / 180) } calc_dist = function(fr, to) { lat1 = as_radians(fr$lat) lon1 = as_radians(fr$lon) lat2 = as_radians(to$lat) lon2 = as_radians(to$lon) = 3963.191; b = 3949.903; numerator = ( a^2 * cos(lat2) )^2 + ( b^2 * sin(lat2) ) ^2 denominator = ( * cos(lat2) )^2 + ( b * sin(lat2) )^2 radiusofearth = sqrt(numerator/denominator) #accounts ellipticity of earth. d = radiusofearth * acos( sin(lat1) * sin(lat2) + cos(lat1)*cos(lat2)*cos(lon2 - lon1) ) d.return = list(distance_miles=d) return(d.return) } raw.og = read.csv("http://statistical-research.com/wp-content/uploads/2013/11/sample_geo.txt", header=t, sep="\t") orig.data = raw.og[,1:3] dirichletclusters_constrained = function(orig.data, k=5, max.iter =50, tolerance = 1, plot.iter=true) { fr = = null r.k.start = sample(seq(1:k)) n = nrow( orig.data ) k.size = ceiling(n/k) initial.clusters = rep(r.k.start, k.size) if(n%%length(initial.clusters)!=0){ exclude.k = length(initial.clusters) - n%%length(initial.clusters) } else { exclude.k = 0 } orig.data$cluster = initial.clusters[1:(length(initial.clusters)-exclude.k)] orig.data$cluster_original = orig.data$cluster ## calc centers , merge mu = cbind( by(orig.data$latitude, orig.data$cluster, mean), by(orig.data$longitude, orig.data$cluster, mean), seq(1:k) ) tmp1 = matrix( match(orig.data$cluster, mu[,3]) ) orig.data.centers = cbind(as.matrix(orig.data), mu[tmp1,])[,c(1:2,4:6)] ## calc initial distance centers fr$lat = orig.data.centers[,3]; fr$lon = orig.data.centers[,4] to$lat = orig.data.centers[,1]; to$lon = orig.data.centers[,2] orig.data$distance.from.center = calc_dist(fr, to)$distance_miles orig.data$distance.from.center_original = orig.data$distance.from.center ## set initial configuration values is.converged = false iteration = 0 error.old = inf error.curr = inf while ( !is.converged && iteration < max.iter ) { # iterate until threshold or maximum iterations if(plot.iter==true){ plot(orig.data$longitude, orig.data$latitude, col=orig.data$cluster, pch=16, cex=.6, xlab="longitude",ylab="latitude") } iteration = iteration + 1 start.time = as.numeric(sys.time()) cat("iteration ", iteration,sep="") for( in 1:n ) { # iterate on each observation , measure distance each observation' mean center # produces exchange. takes observation closest it's mean , in return gives observation # closest giver, k, mean fr = = distances = null for( j in 1:k ){ # determine distance each k group fr$lat = orig.data$latitude[i]; fr$lon = orig.data$longitude[i] to$lat = mu[j,1]; to$lon = mu[j,2] distances[j] = as.numeric( calc_dist(fr, to) ) } # k cluster observation closest. which.min.distance = which(distances==min(distances), arr.ind=true) previous.cluster = orig.data$cluster[i] orig.data$cluster[i] = which.min.distance # replace cluster closest cluster # trade observation closest giving cluster if(previous.cluster != which.min.distance){ new.cluster.group = orig.data[orig.data$cluster==which.min.distance,] fr$lat = mu[previous.cluster,1]; fr$lon = mu[previous.cluster,2] to$lat = new.cluster.group$latitude; to$lon = new.cluster.group$longitude new.cluster.group$tmp.dist = calc_dist(fr, to)$distance_miles take.out.new.cluster.group = which(new.cluster.group$tmp.dist==min(new.cluster.group$tmp.dist), arr.ind=true) locationid = new.cluster.group$locationid[take.out.new.cluster.group] orig.data$cluster[orig.data$locationid == locationid] = previous.cluster } } # calculate new cluster means mu = cbind( by(orig.data$latitude, orig.data$cluster, mean), by(orig.data$longitude, orig.data$cluster, mean), seq(1:k) ) tmp1 = matrix( match(orig.data$cluster, mu[,3]) ) orig.data.centers = cbind(as.matrix(orig.data), mu[tmp1,])[,c(1:2,4:6)] mu = cbind( by(orig.data$latitude, orig.data$cluster, mean), by(orig.data$longitude, orig.data$cluster, mean), seq(1:k) ) ## calc initial distance centers fr$lat = orig.data.centers[,3]; fr$lon = orig.data.centers[,4] to$lat = orig.data.centers[,1]; to$lon = orig.data.centers[,2] orig.data$distance.from.center = calc_dist(fr, to)$distance_miles # test convergence. previous distance within threshold of current total distance center error.curr = sum(orig.data$distance.from.center) error.diff = abs( error.old - error.curr ) error.old = error.curr if( !is.nan( error.diff ) && error.diff < tolerance ) { is.converged = true } # set time see how long process take going through iterations stop.time = as.numeric(sys.time()) hour.diff = (((stop.time - start.time) * (max.iter - iteration))/60)/60 cat("\n error ",error.diff," hours remain iterations ",hour.diff,"\n") # write out iterations. can later used starting point if iterations need pause write.table(orig.data, paste("c:\\optimize_iteration_",iteration,"_instore_data.csv", sep=""), sep=",", row.names=f) } centers = data.frame(mu) ret.val = list("centers" = centers, "cluster" = factor(orig.data$cluster), "locationid" = orig.data$locationid, "latitude" = orig.data$latitude, "longitude" = orig.data$longitude, "k" = k, "iterations" = iteration, "error.diff" = error.diff) return(ret.val) } # constrained clustering cl_constrain = dirichletclusters_constrained(orig.data, k=4, max.iter=5, tolerance=.0001, plot.iter=true) table( cl_constrain$cluster ) plot(cl_constrain$longitude, cl_constrain$latitude, col=cl_constrain$cluster, pch=16, cex=.6, xlab="longitude",ylab="latitude") library(maps) map("state", add=t) points(cl_constrain$centers[,c(2,1)], pch=4, cex=2, col='orange', lwd=4)
there same-size cluster k-means variation in elki.
it explained in detail in this tutorial.
i have seen lot of people ask such clustering algorithm, not think supported theory use algorithm this.
for use case, have problem of geographic coordinates: k-means uses mean, mean may inconsistent distance function. consider 2 points @ longitude -179° , +178°. k-means use mean of these two, -0.5° cluster center. more sensible choice of cluster center @ +179.5°, on opposite side of earth.
if data constrained reasonably small area, may still work. better quality, may want map data appropriate utm zone. within 1 utm zone, euclidean distance reasonable approximation of distance.
Comments
Post a Comment