[R] merging two dataframes

dividend stapnes at gmail.com
Wed Oct 26 12:59:02 CEST 2011


Hello.

Now i tried to do what you told me.
I used the str(fuction), and data$date1 and data3$date1 where both listed
"character". I changed "name" to character but it did not work either.
I also changed all variables to character, with no positive result.

str(data)
'data.frame':   14446 obs. of  15 variables:
 $ id     : chr  "1" "1" "1" "1" ...
 $ compid : chr  "2514" "2514" "2514" "2514" ...
 $ secid  : chr  "15856" "15856" "15856" "15856" ...
 $ name   : chr  "A-pressen" "A-pressen" "A-pressen" "A-pressen" ...
 $ period : chr  "1" "2" "3" "4" ...
 $ date   : chr  "17.05.1980" "17.05.1981" "17.05.1982" "17.05.1983" ...
 $ enddate: chr  "17.05.1981" "17.05.1982" "17.05.1983" "17.05.1984" ...
 $ div    : chr  NA NA NA NA ...
 $ ndivs  : chr  NA NA NA NA ...
 $ posdiv : chr  NA NA NA NA ...
 $ ddiv2  : chr  NA NA NA NA ...
 $ ddiv3  : chr  NA NA NA NA ...
 $ ddiv4  : chr  NA NA NA NA ...
 $ ddiv5  : chr  NA NA NA NA ...
 $ ddiv6  : chr  NA NA NA NA ...

str(data3)
'data.frame':   812354 obs. of  9 variables:
 $ date                  : chr  "02.01.1996" "03.01.1996" "04.01.1996"
"05.01.1996" ...
 $ Securityid            : chr  "6001" "6001" "6001" "6001" ...
 $ Symbol                : chr  "AAV" "AAV" "AAV" "AAV" ...
 $ name                  : chr  "Adresseavisen" "Adresseavisen"
"Adresseavisen" "Adresseavisen" ...
 $ Securitytype          : chr  "Ordinary Shares" "Ordinary Shares"
"Ordinary Shares" "Ordinary Shares" ...
 $ Unadjusted            : chr  "200" "200" "200" "200" ...
 $ Event.adjusted        : chr  "200" "200" "200" "200" ...
 $ Div.and.Event.adjusted: chr  "109,7595375" "109,7595375" "109,7595375"
"109,7595375" ...
 $ Sharesissued          : chr  "1901646" "1901646" "1901646" "1901646" ...

Here is some suitable data for "data"

> dput(data[1:20,])

structure(list(id = c("1", "1", "1", "1", "1", "1", "1", "1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"), 
    compid = c("2514", "2514", "2514", "2514", "2514", "2514", 
    "2514", "2514", "2514", "2514", "2514", "2514", "2514", "2514", 
    "2514", "2514", "2514", "2514", "2514", "2514"), secid = c("15856", 
    "15856", "15856", "15856", "15856", "15856", "15856", "15856", 
    "15856", "15856", "15856", "15856", "15856", "15856", "15856", 
    "15856", "15856", "15856", "15856", "15856"), name = c("A-pressen", 
    "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
    "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
    "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
    "A-pressen", "A-pressen", "A-pressen", "A-pressen"), period = c("1", 
    "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", 
    "13", "14", "15", "16", "17", "18", "19", "20"), date = c("17.05.1980", 
    "17.05.1981", "17.05.1982", "17.05.1983", "17.05.1984", "17.05.1985", 
    "17.05.1986", "17.05.1987", "17.05.1988", "17.05.1989", "17.05.1990", 
    "17.05.1991", "17.05.1992", "17.05.1993", "17.05.1994", "17.05.1995", 
    "17.05.1996", "17.05.1997", "17.05.1998", "17.05.1999"), 
    enddate = c("17.05.1981", "17.05.1982", "17.05.1983", "17.05.1984", 
    "17.05.1985", "17.05.1986", "17.05.1987", "17.05.1988", "17.05.1989", 
    "17.05.1990", "17.05.1991", "17.05.1992", "17.05.1993", "17.05.1994", 
    "17.05.1995", "17.05.1996", "17.05.1997", "17.05.1998", "17.05.1999", 
    "17.05.2000"), div = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    "0", "0", "0", "0", "0", "5", "0", "1.1", "1.2", "1", "0"
    ), ndivs = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "0", "0", 
    "0", "0", "0", "1", "0", "1", "1", "1", "0"), posdiv = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "1", 
    NA, "1", "1", "1", NA), ddiv2 = c(NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, "0", "0", "0", "0", "0", NA, "0", "1", NA, 
    NA), ddiv3 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    "0", "0", "0", "0", "0", "0", "0", "0", "-1"), ddiv4 = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "0", "0", "0", 
    "0", "0", "0", "0", "0"), ddiv5 = c(NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, "0", "0", "0", "0", "0", "0", 
    "0"), ddiv6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "0", "0", "0", "0", "0", "0")), .Names = c("id", 
"compid", "secid", "name", "period", "date", "enddate", "div", 
"ndivs", "posdiv", "ddiv2", "ddiv3", "ddiv4", "ddiv5", "ddiv6"
), row.names = c(NA, 20L), class = "data.frame")




Here is some suitable data for "data3":

> dput(data3[1:20,])

structure(list(date = c("02.01.1996", "03.01.1996", "04.01.1996", 
"05.01.1996", "08.01.1996", "09.01.1996", "10.01.1996", "11.01.1996", 
"12.01.1996", "15.01.1996", "16.01.1996", "17.01.1996", "18.01.1996", 
"19.01.1996", "22.01.1996", "23.01.1996", "24.01.1996", "25.01.1996", 
"26.01.1996", "29.01.1996"), Securityid = c("6001", "6001", "6001", 
"6001", "6001", "6001", "6001", "6001", "6001", "6001", "6001", 
"6001", "6001", "6001", "6001", "6001", "6001", "6001", "6001", 
"6001"), Symbol = c("AAV", "AAV", "AAV", "AAV", "AAV", "AAV", 
"AAV", "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", 
"AAV", "AAV", "AAV", "AAV", "AAV"), name = c("Adresseavisen", 
"Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen", 
"Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen", 
"Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen", 
"Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen", 
"Adresseavisen", "Adresseavisen", "Adresseavisen"), Securitytype =
c("Ordinary Shares", 
"Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
"Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
"Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
"Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
"Ordinary Shares", "Ordinary Shares", "Ordinary Shares"), Unadjusted =
c("200", 
"200", "200", "200", "200", "200", "200", "200", "200", "200", 
"200", "200", "200", "200", "200", "200", "200", "200", "200", 
"200"), Event.adjusted = c("200", "200", "200", "200", "200", 
"200", "200", "200", "200", "200", "200", "200", "200", "200", 
"200", "200", "200", "200", "200", "200"), Div.and.Event.adjusted =
c("109,7595375", 
"109,7595375", "109,7595375", "109,7595375", "109,7595375", "109,7595375", 
"109,7595375", "109,7595375", "109,7595375", "109,7595375", "109,7595375", 
"109,7595375", "109,7595375", "109,7595375", "109,7595375", "109,7595375", 
"109,7595375", "109,7595375", "109,7595375", "109,7595375"), 
    Sharesissued = c("1901646", "1901646", "1901646", "1901646", 
    "1901646", "1901646", "1901646", "1901646", "1901646", "1901646", 
    "1901646", "1901646", "1901646", "1901646", "1901646", "1901646", 
    "1901646", "1901646", "1901646", "1901646")), .Names = c("date", 
"Securityid", "Symbol", "name", "Securitytype", "Unadjusted", 
"Event.adjusted", "Div.and.Event.adjusted", "Sharesissued"), row.names =
c(NA, 
20L), class = "data.frame")



When I run the function:
 
data4<-merge(data,data3, by=c("name","date1"), all=T)

> dput(data4[1:20,])

structure(list(name = c("A-pressen", "A-pressen", "A-pressen", 
"A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
"A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
"A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
"A-pressen", "A-pressen"), date = c("01.02.1999", "01.02.2000", 
"01.02.2001", "01.02.2002", "01.03.1999", "01.03.2000", "01.03.2001", 
"01.03.2002", "01.04.2003", "01.06.1999", "01.06.2001", "01.07.1999", 
"01.07.2002", "01.07.2003", "01.08.2000", "01.08.2001", "01.08.2002", 
"01.08.2003", "01.09.1999", "01.09.2000"), id = c(NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_), 
    compid = c(NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_
    ), secid = c(NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_), period = c(NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_), enddate = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), div = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), ndivs = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), posdiv = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), ddiv2 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), ddiv3 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), ddiv4 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), ddiv5 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), ddiv6 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), Securityid = c("15856", 
    "15856", "15856", "15856", "15856", "15856", "15856", "15856", 
    "15856", "15856", "15856", "15856", "15856", "15856", "15856", 
    "15856", "15856", "15856", "15856", "15856"), Symbol = c("APR", 
    "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", 
    "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", 
    "APR"), Securitytype = c("Ordinary Shares", "Ordinary Shares", 
    "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
    "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
    "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
    "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
    "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
    "Ordinary Shares", "Ordinary Shares", "Ordinary Shares"), 
    Unadjusted = c("120", "140", "160", "105", "110", "190", 
    "160", "112", "115", "120", "150", "127", "106,5", "154", 
    "155", "160", "111", "155", "127", "205"), Event.adjusted = c("120", 
    "140", "160", "105", "110", "190", "160", "112", "115", "120", 
    "150", "127", "106,5", "154", "155", "160", "111", "155", 
    "127", "205"), Div.and.Event.adjusted = c("111,4092308", 
    "129,9774359", "148,545641", "100,1538462", "102,1251282", 
    "176,3979487", "148,545641", "106,8307692", "109,6923077", 
    "111,4092308", "143,0769231", "117,9081026", "101,5846154", 
    "154", "143,9035897", "152,6153846", "105,8769231", "155", 
    "117,9081026", "190,3241026"), Sharesissued = c("8839643", 
    "8839643", "8854307", "8866191", "8839643", "8839643", "8854307", 
    "8867791", "8885537", "8839643", "8866191", "8839643", "8885537", 
    "8903842", "8854307", "8866191", "8885537", "8903842", "8839643", 
    "8854307")), .Names = c("name", "date", "id", "compid", "secid", 
"period", "enddate", "div", "ndivs", "posdiv", "ddiv2", "ddiv3", 
"ddiv4", "ddiv5", "ddiv6", "Securityid", "Symbol", "Securitytype", 
"Unadjusted", "Event.adjusted", "Div.and.Event.adjusted", "Sharesissued"
), row.names = c(NA, 20L), class = "data.frame")


If I want to just keep the observations from "data" (14446 observations)
merged with "data3". Is it correct to use:
data4<-merge(data,data3, by=c("name","date1"), all=T)
or should I use:
data4<-merge(data,data3, by=c("name","date1"), all.x=F)? 
(Then I get ca 14000 obs, but "NA" in all variables from "data3".

In advance, thank you.



--
View this message in context: http://r.789695.n4.nabble.com/merging-two-dataframes-tp3932869p3940157.html
Sent from the R help mailing list archive at Nabble.com.



More information about the R-help mailing list