[R] graphically representing frequency of words in a speech?

Brown, Tony Nicholas tony.n.brown at Vanderbilt.Edu
Sun Jun 7 23:02:47 CEST 2009


Thank you so much Mark and Gregor. The basic information, suggestions,
and R code that you provided is most helpful. 

Tony

-----Original Message-----
From: Gorjanc Gregor [mailto:Gregor.Gorjanc at bfro.uni-lj.si] 
Sent: Sunday, June 07, 2009 2:17 PM
To: Marc Schwartz; Brown, Tony Nicholas
Cc: rhelp help
Subject: RE: [R] graphically representing frequency of words in a
speech?

> The only thing that I found for R is by Gregor Gorjanc, but the
> information seems to be dated:
>
>    http://www.bfro.uni-lj.si/MR/ggorjan/software/R/index.html#tagCloud

Hi,

Yes, I have tried to create a tag cloud plot in R, but I abandoned the
project
due to other things. The main obstacle was that in R we need to take
care of the fontsizes and placement of words, while this is very easy
with
say browsers, who do all the renderind. I tracked the last version of
the R file
which is pasted bellow. I must say that I do not remember the status of
the
code so use it as you wish. If anyone wishes to take this project
further, please
do so!

gg

### tagCloud.R
###---------------------------------------------------------------------
---
### What: Tag cloud plot functions
### Time-stamp: <2006-09-10 02:53:29 ggorjan>
###---------------------------------------------------------------------
---

tagCloud <- function(x, n=100, decreasing=TRUE,
                     threshold=NULL, fontsize=c(12, 36),
                     align=TRUE, expandRow=TRUE,
                     justRow="bottom", title,
                     textGpar=gpar(col="navy"),
                     rectGpar=gpar(col="white"),
                     titleGpar=gpar(), viewGpar=gpar(),
                     mar=c(1, 1, 1, 1))
{
  UseMethod("tagCloud")
}

tagCloud.default <- function(x, n=100, decreasing=TRUE,
                             threshold=NULL, fontsize=c(12, 36),
                             align=TRUE, expandRow=TRUE,
                             justRow="bottom", title,
                             textGpar=gpar(col="navy"),
                             rectGpar=gpar(col="white"),
                             titleGpar=gpar(), viewGpar=gpar(),
                             mar=c(1, 1, 1, 1))
{
  if(!is.null(dim(x))) stop("'x' must be a vector")

  tagCloud.table(table(x), n=n, decreasing=decreasing,
fontsize=fontsize,
                 threshold=threshold, align=align, expandRow=expandRow,
                 justRow=justRow, title=title, textGpar=textGpar,
                 rectGpar=rectGpar, titleGpar=titleGpar,
viewGpar=viewGpar,
                 mar=mar)
}

tagCloud.table <- function(x, n=100, decreasing=TRUE,
                           threshold=NULL, fontsize=c(12, 36),
                           align=TRUE, expandRow=TRUE,
                           justRow="bottom", title,
                           textGpar=gpar(col="navy"),
                           rectGpar=gpar(col="white"),
                           titleGpar=gpar(), viewGpar=gpar(),
                           mar=c(1, 1, 1, 1))
{
  ## --- Check ---

  if(length(dim(x)) != 1)
    stop("'x' must be one dimensional table")

  ## --- Threshold ---

  if(!is.null(threshold)) x <- x[x >= threshold]

  ## --- Number of units ---

  N <- length(x)                ## length of table
  if(is.null(n)) {              ## if n=NULL, plot all units
    n <- N
  } else {
    if(n > N) n <- N            ## if n is to big, decrease it
    if(n < 1) n <- round(N * n) ## if n is percentage of units
  }

  fontsizeLength <- length(fontsize)
  if(fontsizeLength != 2)
    stop("'fontsize' must be of length two")

  ## --- Sort and subset ---

  if(n < N) { ## only if we want to plot subset of units
    tmp <- sort(x, decreasing=decreasing)
    x <- x[names(x) %in% names(tmp[1:n])]
  }

  ## --- Get relative freq ---

  x <- prop.table(x)

  ## --- Fontsize ---

  fontsizeDiff <- diff(fontsize)
  xDiff <- max(x) - min(x)
  if(xDiff != 0) {
    off <- ifelse(fontsizeDiff > 0, min(x), max(x))
    fontsize <- (x - off) / xDiff * fontsizeDiff + min(fontsize)
  } else { ## all units have the same frequency
    fontsize <- rep(min(fontsize), times=n)
  }

  ## --- Viewport and rectangle ---

  grid.newpage()
  width <- unit(1, "npc")
  height <- unit(1, "npc")
  vp <- viewport(y=unit(mar[1], "lines"), x=unit(mar[2], "lines"), ,
                 width=width - unit(mar[2] + mar[4], "lines"),
                 height=height - unit(mar[1] + mar[3], "lines"),
                 just=c("left", "bottom"), gp=viewGpar, name="main")
  pushViewport(vp)

  if(!missing(title))
    grid.text(title, y=height, gp=titleGpar, name="title")

  grid.rect(gp=rectGpar, name="cloud")

  ## --- Grobs ---

  tag <- vector(mode="list", length=4)
  names(tag) <- c("fontsize", "grob", "width", "height")
  tag[[1]] <- tag[[2]] <- tag[[3]] <- tag[[4]] <- vector(mode="list",
length=n)
  for(i in 1:n) {
    tag$fontsize[[i]] <- fontsize[i]
    tag$grob[[i]] <- textGrob(names(x[i]),
gp=gpar(fontsize=fontsize[i]))
    tag$width[[i]] <- convertWidth(grobWidth(tag$grob[[i]]),
unitTo="npc",
                                   valueOnly=TRUE)
    tag$height[[i]] <- convertHeight(grobHeight(tag$grob[[i]]),
unitTo="npc",
                                     valueOnly=TRUE)
  }

  ## --- Split lines ---

  row <- colWidth <- vector(length=n)
  row[1] <- 1
  colWidth[1] <- 0
  lineWidth <- tag$width[[1]]
  j <- 1
  gapWidth <- convertWidth(stringWidth(" "), unitTo="npc",
valueOnly=TRUE)
  maxWidth <- convertWidth(width, unitTo="npc", valueOnly=TRUE)

  for(i in 2:length(tag$width)) {
    test <- lineWidth + gapWidth + tag$width[[i]]
    if(test < maxWidth) {
      row[i] <- row[i - 1]
      colWidth[i] <- lineWidth + gapWidth
      lineWidth <- test
      j <- j + 1
    } else {
      if(align) { ## Align units in previous row
        free <- maxWidth - lineWidth
        if(j == 1) {
          colWidth[i - 1] <- maxWidth / 2 -  tag$width[[i - 1]] / 2
        } else {
          gapWidthAlign <- free / j
          start <- i - (j - 1)
          end <- start + j - 2
          colWidth[start:end] <- colWidth[start:end] +
            cumsum(rep(gapWidthAlign, times=(j - 1)))
        }
      }
      row[i] <- row[i - 1] + 1
      lineWidth <- tag$width[[i]]
      colWidth[i] <- 0
      j <- 1
    }
  }

  rowHeight <- tapply(unlist(tag$height), list(row), max)

  ## --- Is there to many rows for given dimension of a rectangle ---

  sumRowHeight <- sum(rowHeight)
  heightNum <- convertWidth(height, unitTo="npc", valueOnly=TRUE)
  if(sumRowHeight > heightNum) {
    msg <- c("can not fit into defined dimension;",
             "adjust dimension, fontsize or number of units;",
             "keeping else constant, height should be at least",
             sumRowHeight)
    stop(cat(msg, fill=TRUE))
  } else {
    if(expandRow) { ## increase height of row to fit nicely
      heightDiff <- heightNum - sumRowHeight
      heightDiff <- heightDiff / max(row)
      rowHeight <- rowHeight + heightDiff
    }
  }

  ## We have to plot from top to bottom and text should be in the bottom
  ## or center of the line
  rowHeightCenter<- ifelse(justRow == "bottom", 0, rowHeight / 2)
  rowHeight <- heightNum - (cumsum(rowHeight) - rowHeightCenter)

  rowHeight <- rowHeight[row]

  textGpar$fontsize <- unlist(tag$fontsize)
  grid.text(label=names(x), gp=textGpar,
            x=unit(colWidth, units="npc"),
            y=unit(rowHeight, units="npc"), just=c("left", justRow),
            name="tag")
}

## getNames()
## grid.edit("tag", gp=gpar(col="red"))

###---------------------------------------------------------------------
---
### tagCloud.R ends here




More information about the R-help mailing list