[R] cophenetic matrix

houix@ircam.fr houix at ircam.fr
Tue Jun 12 11:45:17 CEST 2001


Hello,
I analyse some free-sorting data so I use hierarchical
clustering.
I want to compare my proximity matrix with the tree 
representation to evalute the fitting. (stress, cophenetic correlation
(pearson's correlation)...)

"The cophenetic similarity of two objects a and b is defined as the
similarity level at wich objects a and b become members of the same
cluster during the course of clustering" Legendre, P and Legendre, L Numeri
Ecology, 1998.

I've made a (tricky) function to compute this cophenetic matrix. But now 
I need some help to evaluate my function with your data. Because for
little matrix (as Legendre's example), I found the same cophenetic
correlation  as Matlab do (direct calculation) but for bigger matrix
(30x30) my results differ!!

Maybe my function is wrong?
To use my function you need to have the sm library. 

Thanks 
Olivier Houix  
-- 
Olivier Houix  <houix at ircam.fr>             tel: 01 44 78 15 51
Equipe Perception et Cognition Musicales    http://www.ircam.fr/  
IRCAM   1 place Igor Stravinsky             75004         Paris           
-------------- next part --------------
### cophenetic matrix that represents tree distance
### I use the function ask to interactively input data
### sorry for people that doesn't like this
### the cophenetic matrix is saved with the name:
### filename.arbre
### source("cophmatrix.r")
### 
cophmatrix <- function(){
  library(mva)
  library(sm)
  nom <- ask(message="Name of the file")
  nbre <- ask(message="Number of sounds")
  quest <- ask(message="Matrix of dissimilarity (1) or  nxm matrix (2)")
  fichier <- read.table(nom)
  ## matrix nxn ou nxm 
  ## proximity between objects: nxn or description of n objects with m attributes
  if(quest == 1){
    dis1 <- as.dist(fichier)
  }
  else{
    methode <- ask(message="The distance measure to be used = euclidean:1 maximum:2 manhattan:3 canberra:4 or binary:5")
    methodo <- c("euclidean", "maximum", "manhattan", "canberra" ,"binary")  
    dis1 <- dist(fichier , method = methodo[[methode]])
  }
  methode2 <- ask(message ="The agglomeration method to be used = ward:1 single:2 complete:3 average:4 mcquitty:5 median:6 centroid:7")
  methodo2 <- c("ward", "single", "complete", "average","mcquitty", "median","centroid")
  hc <- hclust(dis1 , method = methodo2[[methode2]])
  merge <- hc$merge
  height <- hc$height
  nodal <- cbind(merge,height) ### layout of the nodes 
  long <-  length(height) + 1
  distarbre <- mat.or.vec(nbre,nbre)

  ## the method is very simple as the program hclust works.
  ## For clusters  merging, I compute the distance between
  ## objects belonging to the differents clusters merged.
  ## If you look at hc$merge, you see that a negative value correspond
  ## to an object and a positive value to a node.
  
  for(i in 1:(long-1)){
    if(i == 1){
      if((nodal[i,1] < 0) &&  (nodal[i,2] < 0)){
        noeud <- list(c(abs(nodal[i,1]),abs(nodal[i,2])))
        distarbre[as.integer(abs(nodal[i,1])),as.integer(abs(nodal[i,2]))] <-  nodal[i,3]
        distarbre[as.integer(abs(nodal[i,2])),as.integer(abs(nodal[i,1]))] <-  nodal[i,3]
      }
      if(((nodal[i,1] < 0) && (nodal[i,2] > 0)) ||  (nodal[i,2] < 0) && (nodal[i,1] > 0)){
        if(nodal[i,1] > 0){          
          noeud <- list(c(unlist(noeud[[as.integer(abs(nodal[i,1]))]]),abs(nodal[i,2])))
          
        }
        else  noeud <- list(c(unlist(noeud[[as.integer(abs(nodal[i,2]))]]),abs(nodal[i,1])))
        
      }      
      if((nodal[i,1] > 0) &&  (nodal[i,2] > 0)){
        noeud <- list(c(unlist(noeud[[abs(nodal[i,1])]]),unlist(noeud[[as.integer(abs(nodal[i,2]))]])))
      }
      
    }
    else{
      if((nodal[i,1] < 0) &&  (nodal[i,2] < 0)){
        noeud <- c(noeud,list(c(abs(nodal[i,1]),abs(nodal[i,2]))))
        distarbre[as.integer(abs(nodal[i,1])),as.integer(abs(nodal[i,2]))] <-  nodal[i,3]
        distarbre[as.integer(abs(nodal[i,2])),as.integer(abs(nodal[i,1]))] <-  nodal[i,3]
      }
      if(((nodal[i,1] < 0) && (nodal[i,2] > 0)) ||  (nodal[i,2] < 0) && (nodal[i,1] > 0)){
        if(nodal[i,1] > 0){          
          for(l in 1:length(noeud[[as.integer(abs(nodal[i,1]))]])){
            print(l)
            distarbre[noeud[[as.integer(abs(nodal[i,1]))]][l],abs(nodal[i,2])] <- nodal[i,3]
            distarbre[abs(nodal[i,2]),noeud[[as.integer(abs(nodal[i,1]))]][l]] <- nodal[i,3] 
          }
          noeud <- c(noeud,list(c((noeud[[as.integer(abs(nodal[i,1]))]]),abs(nodal[i,2]))))
        }
        else{
          print(length(noeud[[as.integer(abs(nodal[i,2]))]]))
          for(m in 1:length(noeud[[as.integer(abs(nodal[i,2]))]])){
            print(m)
            distarbre[noeud[[as.integer(abs(nodal[i,2]))]][m],abs(nodal[i,1])] <- nodal[i,3]
            distarbre[abs(nodal[i,1]),noeud[[as.integer(abs(nodal[i,2]))]][m]] <- nodal[i,3] 
          }
          noeud <- c(noeud,list(c((noeud[[as.integer(abs(nodal[i,2]))]]),abs(nodal[i,1]))))
        }
      }      
      if((nodal[i,1] > 0) &&  (nodal[i,2] > 0)){
        for(n in 1:length(noeud[[as.integer(abs(nodal[i,1]))]])){
          for(o in 1:length(noeud[[as.integer(abs(nodal[i,2]))]])){            
            distarbre[noeud[[as.integer(abs(nodal[i,2]))]][o],noeud[[as.integer(abs(nodal[i,1]))]][n]] <- nodal[i,3]
            distarbre[noeud[[as.integer(abs(nodal[i,1]))]][n],noeud[[as.integer(abs(nodal[i,2]))]][o]] <- nodal[i,3]
          }
        }
        noeud <- c(noeud,list(c((noeud[[as.integer(abs(nodal[i,1]))]]),(noeud[[as.integer(abs(nodal[i,2]))]]))))
      }
    }
  }
  ##print(diag(distarbre))
  ##print(distarbre)
  write.table(distarbre,paste(nom,".arbre",sep=""), quote = FALSE,eol = "\n",row.names = FALSE,col.names=FALSE)
}








More information about the R-help mailing list