[R] text vector clustering

Stefan Th. Gries stgries at gmail.com
Fri Jan 23 17:28:16 CET 2009


Hans-Joerg Bibiko's function levenshtein() would help; see below for an
example (very clumsy, with two nested loops, but you can tidy that up
with the apply family of functions).
HTH,
STG


levenshtein <- function(string1, string2, case=TRUE, map=NULL) {
  ########
  # levenshtein algorithm in R
  #
  # Author  : Hans-Joerg Bibiko
  # Date    : 29/06/2006
  # Contact : bibiko at eva.mpg.de
  ########
  # Computes the Levenshtein (edit) distance between two strings: the
  # minimum number of single-character insertions, deletions and
  # substitutions needed to turn one string into the other.
  #
  # string1, string2 := strings to compare (only the first element of
  #   each argument is compared)
  # case = TRUE := case sensitivity; case = FALSE := case insensitivity
  # map := character vector of
  #   c(regexp1, replacement1, regexp2, replacement2, ...)
  #   Each regexp/replacement pair is applied with gsub() to both strings
  #   before they are compared. Pairs are applied in sequence, so a later
  #   pair also sees the output of the earlier ones.
  #   example:
  #      map <- c("[aeiou]","V","[^aeiou]","C")
  #      levenshtein("Bank","Bond", map=map)   =>  0
  #
  # Returns a single number; 0 when the (folded, mapped) strings are equal.
  ########

  # Case folding is needed in several places; define it once.
  fold <- function(s) if (case) s else tolower(s)

  string1 <- fold(string1[1L])
  string2 <- fold(string2[1L])

  if (!is.null(map)) {
    m <- matrix(map, ncol = 2, byrow = TRUE)
    s <- c(string1, string2)
    # seq_len() keeps an empty map from iterating over 1:0.
    for (i in seq_len(nrow(m))) s <- gsub(m[i, 1], m[i, 2], s)
    # Fold again: with case = FALSE the replacements themselves are
    # compared case-insensitively.
    string1 <- fold(s[1])
    string2 <- fold(s[2])
  }

  if (string1 == string2) return(0)

  # Prefix a blank so that row/column 1 of the table stands for the
  # empty prefix and character k of a string sits at index k + 1.
  s1 <- strsplit(paste0(" ", string1), NULL)[[1]]
  s2 <- strsplit(paste0(" ", string2), NULL)[[1]]

  l1 <- length(s1)
  l2 <- length(s2)

  # d[i, j] = distance between the first i-1 chars of string1 and the
  # first j-1 chars of string2.
  d <- matrix(0, nrow = l1, ncol = l2)
  d[, 1] <- seq_len(l1) - 1
  d[1, ] <- seq_len(l2) - 1

  # seq_len(n - 1) + 1 is empty when n == 1 (one string is empty), so the
  # loops are skipped instead of running over 2:1 and indexing out of bounds.
  for (i in seq_len(l1 - 1) + 1) {
    for (j in seq_len(l2 - 1) + 1) {
      cost <- if (s1[i] == s2[j]) 0 else 1
      d[i, j] <- min(d[i - 1, j] + 1,        # deletion
                     d[i, j - 1] + 1,        # insertion
                     d[i - 1, j - 1] + cost) # match / substitution
    }
  }

  d[l1, l2]
} # end of function Hans-Joerg Bibiko's levenshtein

# generate names: 10 random strings of 4 to 10 lowercase letters each
set.seed(1)  # fixed seed so the example is reproducible
all.names <- character(10)
for (i in seq_along(all.names)) {
  # draw the length first, then the letters (keeps the RNG draw order)
  name.length <- sample(4:10, 1)
  all.names[i] <- paste(sample(letters, name.length, replace = TRUE),
                        collapse = "")
}
all.names

# generate matrix: square, one row and column per name, zeros on the
# diagonal. Despite its name, `sims` holds pairwise edit distances.
n.names <- length(all.names)
sims <- matrix(0, nrow = n.names, ncol = n.names,
               dimnames = list(all.names, all.names))

# fill matrix (clumsy): compute each unordered pair once and mirror it,
# since the distance is symmetric. seq_len() avoids a backwards 1:0 loop
# when there are fewer than two names.
for (j in seq_len(n.names - 1)) {
  for (k in (j + 1):n.names) {
    sims[j, k] <- sims[k, j] <- levenshtein(all.names[j], all.names[k])
  }
}

# hierarchical clustering of the names on their edit distances
plot(hclust(as.dist(sims)))




More information about the R-help mailing list