[R] Help with Data Transformation

Guy Jett GJett at itsi.com
Mon Jan 10 23:20:18 CET 2011


Thank you John,
I have put a copy of the dput file below my signature block.  I hope that is what you need as I am unfamiliar with that function.  Note that empty cells need to become "NA".
The "=" character is part of the "PARVALUE" column.
Yours,
Guy
gjett at itsi.com

structure(list(X = c(2268L, 2269L, 2270L, 2272L, 2273L, 2274L, 
2275L, 2276L, 2289L, 2290L, 2291L, 2292L, 2293L, 2294L, 2295L, 
2296L, 2326L, 2327L, 2328L, 2329L, 2330L, 2331L, 2332L, 2346L, 
2349L, NA, NA, NA, NA, NA), fldsampid = structure(c(3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 2L, 3L, 4L), .Label = c("", "fldsampid", 
"LHR020GW-01E2", "LHR020SD-00E2"), class = "factor"), CLP_ID = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 3L), .Label = c("", 
"CLP_ID", "MY77J8"), class = "factor"), sacode = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 3L, 2L, 2L), .Label = c("", 
"N", "sacode"), class = "factor"), matrix = structure(c(4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L, 4L, 3L), .Label = c("", 
"matrix", "SE", "WG"), class = "factor"), etc. = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("", 
"etc."), class = "factor"), prccode = structure(c(4L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 8L, 5L, 6L, 7L, 
5L, 5L, 5L, 5L, 5L, 6L, 1L, 1L, 3L, 2L, NA), .Label = c("", "<value>", 
"CL", "INO", "MET", "MI", "ORG", "SN"), class = "factor"), Lab = structure(c(4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 5L, 2L, NA), .Label = c("", 
"<value>", "A4SW", "BRLS", "PO4"), class = "factor"), EXMCODE = structure(c(5L, 
5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L, 5L, 5L, 4L, 
4L, 5L, 4L, 4L, 4L, 4L, 7L, 4L, 1L, 1L, 6L, 2L, NA), .Label = c("", 
"<value>", "FLDFLT", "METHOD", "NONE", "SO4", "SW3050B"), class = "factor"), 
    Analysis = structure(c(13L, 13L, 13L, 10L, 11L, 11L, 11L, 
    11L, 11L, 11L, 11L, 3L, 3L, 3L, 5L, 13L, 9L, 8L, 15L, 7L, 
    12L, 12L, 12L, 14L, 4L, 1L, 1L, 6L, 2L, 2L), .Label = c("", 
    "<value>", "A2320", "A2540G", "A5310B", "AG", "C245.5", "E160.3", 
    "E1630", "E1631", "E1638", "E200.8", "E300", "SW6010B", "SW9060"
    ), class = "factor"), PARLABEL = structure(c(10L, 16L, 17L, 
    12L, 3L, 4L, 8L, 9L, 20L, 21L, 15L, 5L, 6L, 7L, 11L, 14L, 
    13L, 18L, 19L, 12L, 3L, 8L, 9L, 20L, 18L, 1L, 1L, 4L, 2L, 
    2L), .Label = c("", "<value>", "AG", "AL", "ALK", "ALKB", 
    "ALKC", "AS", "B", "CL", "DOC", "HG", "MEHG", "NO3", "PB", 
    "PO4", "SO4", "SOLID", "TOC", "V", "Zn"), class = "factor"), 
    PARVQ = structure(c(3L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    5L, 3L, 3L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 3L, 3L, 3L, 3L, 
    1L, 1L, 4L, 2L, 2L), .Label = c("", "<value>", "=", "AS", 
    "ND"), class = "factor"), Result = structure(c(12L, 20L, 
    11L, 3L, 10L, 8L, 14L, 26L, 9L, 5L, 4L, 25L, 25L, 13L, 19L, 
    24L, 15L, 18L, 16L, 21L, 6L, 22L, 7L, 23L, 17L, 1L, 1L, 27L, 
    2L, 2L), .Label = c("", "<value>", "0.00171", "0.008", "1.76", 
    "1050", "11400", "122", "131", "2.57", "22460", "23590.9", 
    "2500", "317", "4.28", "4.823", "47.7", "48.45", "49330", 
    "50", "5100", "5500", "56900", "792", "807000", "9970", "B"
    ), class = "factor"), X.1 = structure(c(1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L), .Label = c("", "<value>", 
    "V"), class = "factor"), X.2 = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "Zn"), class = "factor"), X.3 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L), .Label = c("", 
    "<value>", "etc."), class = "factor"), X.4 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "ALK"), class = "factor"), X.5 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "ALKB"), class = "factor"), X.6 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "ALKC"), class = "factor"), X.7 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L), .Label = c("", 
    "<value>", "SOLID"), class = "factor"), X.8 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "DOC"), class = "factor"), X.9 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "TOC"), class = "factor"), X.10 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "NO3"), class = "factor")), .Names = c("X", "fldsampid", 
"CLP_ID", "sacode", "matrix", "etc.", "prccode", "Lab", "EXMCODE", 
"Analysis", "PARLABEL", "PARVQ", "Result", "X.1", "X.2", "X.3", 
"X.4", "X.5", "X.6", "X.7", "X.8", "X.9", "X.10"), class = "data.frame", row.names = c(NA, 
-30L))


-----Original Message-----
From: John Kane [mailto:jrkrideau at yahoo.ca] 
Sent: Monday, January 10, 2011 1:43 PM
To: r-help at r-project.org; Guy Jett
Subject: Re: [R] Help with Data Transformation

That sample data set is really hard to read.  Could you resent it after having used dput on it?  

A file output with dput is easily read into R and makes seeing what you need much easier.  BTW what are the = doing?

Thanks



--- On Mon, 1/10/11, Guy Jett <GJett at itsi.com> wrote:

> From: Guy Jett <GJett at itsi.com>
> Subject: [R] Help with Data Transformation
> To: "r-help at r-project.org" <r-help at r-project.org>
> Received: Monday, January 10, 2011, 3:59 PM Greetings, I am new to R 
> and am having trouble with parsing a file with the following 
> characteristics:
> 
> *         Individual results
> for a single sample are written to multiple lines.
> 
> *         First 16 columns
> are constant from sample to sample.
> 
> *         Remaining 10 need
> to be matched up (cross-tabbed?)
> 
> o   (the exact contents for the remaining 10 vary from sample to 
> sample, as indicated in the extract
> below)
> 
> *         Ultimate goal is to
> run various comparisons between the variable columns, compare samples 
> from separate populations, and graph samples from the separate 
> populations.
> 
> *         (An extract is
> provided below)
> 
> The data is initially extracted from an SQL database into Excel, then 
> saved as a tab-delimited text file for use in R.
> I have been successful in using subset() to extract specific sample 
> types, but have not yet been able to transform the data so that all 
> the data needed is on a single line.  I have looked at several R 
> manuals, read through 'R in a Nutshell', prowled the help resources (R 
> Site Search and the Google link), tried stack(), subset(), reshape(), 
> and several other functions, to no avail.
> 
> Thank you very much for your help.  This seems like a wonderful 
> community, Guy Jett, R.G.
> Project Geologist
> gjett at itsi.com<mailto:gjett at itsi.com>
> 
> Example Data Input (subset):
>                
> fldsampid
> CLP_ID  sacode
> matrix   etc...
> prccode
>   Lab
>    EXMCODE
>    Analysis
>       PARLABEL
>         PARVQ Result
> 2268       LHR020GW-01E2
>                
>              N
>            WG
>                
>       INO
> BRLS      NONE    E300
>    CL
>    =
>    23590.9
> 2269       LHR020GW-01E2
>                
>              N
>            WG
>                
>       INO
> BRLS      NONE    E300
>    PO4        ND
>         50
> 2270       LHR020GW-01E2
>                
>              N
>            WG
>                
>       INO
> BRLS      NONE    E300
>    SO4        =
>            22460
> 2272       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MET
>    BRLS      FLDFLT
> E1631    HG
> =
>    0.00171
> 2273       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MET
>    BRLS      FLDFLT
> E1638    AG
> =             2.57
> 2274       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MET
>    BRLS      FLDFLT
> E1638    AL
>    =
>    122
> 2275       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MET
>    BRLS      FLDFLT
> E1638    AS
>    =
>    317
> 2276       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MET
>    BRLS      FLDFLT
> E1638    B
>    =
>    9970
> 2289       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MET
>    BRLS      FLDFLT
> E1638    V
>    =
>    131
> 2290       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MET
>    BRLS      FLDFLT
> E1638    Zn
>    =
>    1.76
> 2291       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MET
>    BRLS      METHOD
>             E1638
> PB           ND
>               0.008
> 2292       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MI
> BRLS      NONE    A2320
>   ALK        =
>        807000
> 2293       LHR020GW-01E2
>                
>              N
>            WG
>                
>       MI
> BRLS      NONE    A2320
>   ALKB      =
>      807000
> 2294       LHR020GW-01E2
>                
>              N 
>            WG 
>                
>       MI         
> BRLS      NONE    A2320 
>   ALKC      ND     
>     2500
> 2295       LHR020GW-01E2 
>                
>              N 
>            WG 
>                
>       ORG   
>    BRLS      NONE   
> A5310B DOC       =   
>          49330
> 2296       LHR020GW-01E2 
>                
>              N 
>            WG 
>                
>       SN         
> BRLS      NONE    E300   
>    NO3       = 
>            792
> 2326       LHR020SD-00E2 
>            
>    N         
>    SE         
>              
>    MET   
>    BRLS      METHOD 
>             E1630   
> MEHG   =         
>    4.28
> 2327       LHR020SD-00E2 
>            
>    N         
>    SE         
>              
>    MI         
> BRLS      METHOD       
>       E160.3   SOLID 
>   =         
>    48.45
> 2328       LHR020SD-00E2 
>            
>    N         
>    SE         
>              
>    ORG   
>    BRLS      NONE   
> SW9060             
>   TOC        =     
>        4.823
> 2329       LHR020SD-00E2 MY77J8
> N         
>    SE         
>              
>    MET   
>    A4SW    METHOD   
>          
> C245.5   HG         
> =             5100
> 2330       LHR020SD-00E2 MY77J8
> N         
>    SE         
>              
>    MET   
>    A4SW    METHOD   
>          
> E200.8   AG         
> ND          1050
> 2331       LHR020SD-00E2 MY77J8
> N         
>    SE         
>              
>    MET   
>    A4SW    METHOD   
>          
> E200.8   AS       
>    =         
>    5500
> 2332       LHR020SD-00E2 MY77J8
> N         
>    SE         
>              
>    MET   
>    A4SW    METHOD   
>          
> E200.8   B         
>    =         
>    11400
> 2346       LHR020SD-00E2 MY77J8
> N         
>    SE         
>              
>    MET   
>    A4SW    SW3050B   
>          SW6010B   
>          V     
>        =       
>         56900
> 2349       LHR020SD-00E2 MY77J8
> N         
>    SE         
>              
>    MI         
> A4SW    METHOD         
>     A2540G         
>       SOLID    =   
>          47.7
> 
> Desired output:
>                
> fldsampid           
> CLP_ID  sacode 
> matrix   etc...      CL 
>          PO4   
>     SO4        AG   
>       AL       
>    AS       
>    B         
>    V         
>    Zn         
>       etc...      ALK 
>       ALKB      ALKC 
>     SOLID    DOC   
>    TOC        NO3
>                
> LHR020GW-01E2           
>                
>    N         
>    WG         
>              
> <value>           
>     <value>       
>         <value>   
>            
> <value>           
>     <value>       
>         <value>   
>            
> <value>           
>     <value>       
>         <value>   
>            
> <value>           
>     <value>       
>         <value>   
>            
> <value>           
>     <value>       
>         <value>   
>            
> <value>           
>     <value>
>                
> LHR020SD-00E2 MY77J8 N         
>    SE         
>              
>    NA         
> NA          NA     
>     <value>       
>         <value>   
>            
> <value>           
>     <value>       
>         <value>   
>             NA   
>       <value>     
>           NA     
>     NA          NA 
>         <value>   
>             NA   
>       NA         
> NA
> 
>     [[alternative HTML version deleted]]
> 
> ______________________________________________
> R-help at r-project.org
> mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained,
> reproducible code.
> 



More information about the R-help mailing list