[R] Help on reducing multiple loops

John Kane jrkrideau at yahoo.ca
Thu May 18 11:55:39 CEST 2017


Data? It's difficult to do anything without some test data.See How to make a great R reproducible example? or http://adv-r.had.co.nz/Reproducibility.html 
 with particular reference to the use of dput() as the best way to provide sample data.

  
|  
|   
|   
|   |    |

   |

  |
|  
|   |  
How to make a great R reproducible example?
 When discussing performance with colleagues, teaching, sending a bug report or searching for guidance on mailing...  |   |

  |

  |

 
 

    On Wednesday, May 17, 2017 6:10 PM, Sumanta Basak <sumanta24 at gmail.com> wrote:
 

 Hi All,

I've a data-set on product sub-product matrix on which I'm doing multiple
calculation, but unfortunately using nested loops, the programme is taking
long time to execute. Can anyone help me how to get rid of the following
jungle? Any direction would be helpful.

GA <- "India"
verticle <- "Prod1"

prod_data <- readRDS(paste0("/Prod_ladder_",GA,"_",verticle,".rds"))
setDF(prod_data)

Final_data <-
subset(prod_data[,c("P_KEY","Active_Prod_Id","Active_Prod_Nm")],!duplicated(prod_data[,c("P_KEY","Active_Prod_Id")]))

proximity_prod_mapping <- readRDS("Proximity_prod_mapping.rds")
dst_prod <- subset(prod_data[,c("P_KEY")],!duplicated(prod_data$P_KEY))


output_data <- c()
data_merge_final <- c()

system.time({
  for(i in 1 : length(dst_prod)){

    prod_data <- subset(prod_data,prod_data$P_KEY == dst_prod[i]) #
Subsetting data at prod level
    dst_prod <-
subset(prod_data[,c("Active_Prod_Id")],!duplicated(prod_data$Active_Prod_Id))
# Finding distinct prods of active prodloyee

    for(j in 1 : length(dst_prod)){
      # Subsetting data at prod level for active prod
      # Fetiching data for Anchor prod
      prod_data1 <-
subset(prod_data[,c("P_KEY","Active_Prod_Id","Active_Prod_Nm","Start_Date_1","End_Date_1")],prod_data$Active_Prod_Id
== dst_prod[j])
      prod_data1$Anchor_prod <- 1
      anc_max_End_Date_1 <- as.Date(max(prod_data1$End_Date_1),origin =
"1970-01-01")
      anc_prod_count <- sum(prod_data1$Anchor_prod)

      # Fetiching data for Proximate prod
      prox_prod_data <-
subset(proximity_prod_mapping[,c("Proximate_prod_ID")],proximity_prod_mapping$Anchor_prod_ID
== dst_prod[j])
      prod_data2 <-
subset(prod_data[,c("P_KEY","Active_Prod_Id","Active_Prod_Nm","Start_Date_1","End_Date_1")],prod_data$Active_Prod_Id
%in% c(prox_prod_data))
      prox_sill_count <- 0
      if(nrow(prod_data2) > 0){
        prod_data2$Proximity_prod <- 1
        prox_max_End_Date_1 <- as.Date(max(prod_data2$End_Date_1),origin =
"1970-01-01")
        prox_sill_count <- sum(prod_data2$Proximity_prod)
      }
      # library(plyr)
      prod_data <-rbind.fill(prod_data1,prod_data2)
      prod_data$exclude <- 0
      prod_data$Anchor_Active_Prod_Id <- dst_prod[j]

      prod_data$Start_Date_1 <- as.Date(prod_data$Start_Date_1,origin =
"1970-01-01")
      prod_data$End_Date_1 <- as.Date(prod_data$End_Date_1,origin =
"1970-01-01")

      if(prox_sill_count > 0){
        if(nrow(prod_data) > 1){
          # Trimming end date of proximity prods where end data of
proximity prod is greater that Anchor prod
          if((prox_max_End_Date_1 - anc_max_End_Date_1) > 0){
            prod_data$End_Date_1 <- ifelse(prod_data$Proximity_prod == 1 &
(prod_data$End_Date_1 - anc_max_End_Date_1) > 0,
anc_max_End_Date_1,prod_data$End_Date_1)
            prod_data$End_Date_1 <- as.Date(prod_data$End_Date_1,origin =
"1970-01-01")
          }
          prod_data$exclude <- ifelse(prod_data$Proximity_prod == 1 &
(as.Date(prod_data$Start_Date_1,origin = "1970-01-01") -
anc_max_End_Date_1) > 0,1,0)
          prod_data <- subset(prod_data,prod_data$exclude == 0)

          prod_data <-
arrange(prod_data,prod_data$Anchor_prod,desc(prod_data$End_Date_1),prod_data$Start_Date_1)

          prod_data$Anchor_prod <- ifelse(is.na
(prod_data$Anchor_prod),0,prod_data$Anchor_prod)
          prod_data$Proximity_prod <- ifelse(is.na
(prod_data$Proximity_prod),0,prod_data$Proximity_prod)
          prod_data$new_rec <- 0

          tot_loop <- nrow(prod_data)
          k=1
          # Looping to map start date and end date of each row with other
rows
          while(k <= tot_loop){
            excl_flag <- prod_data[k,c("exclude")]
            if(excl_flag == 0){
              st_dt1 <- as.Date(prod_data[k,c("Start_Date_1")])
              end_dt1 <- as.Date(prod_data[k,c("End_Date_1")])
              prod_flag1 <- prod_data[k,c("Anchor_prod")]

              if(k != nrow(prod_data)){
                tot_row <- nrow(prod_data)

                for(m in 1 : (tot_row -k)){
                  l = k+m
                  if(l != k){
                    st_dt2 <- as.Date(prod_data[l,c("Start_Date_1")])
                    end_dt2 <- as.Date(prod_data[l,c("End_Date_1")])
                    prod_flag2 <- prod_data[l,c("Anchor_prod")]

                    flag_excl <- prod_data[l,c("exclude")]
                    if(flag_excl ==0){
                      rec_check <- prod_data[l,c("new_rec")]
                      # if(rec_check == 0){
                      prod_data$Start_date2 <- NA
                      prod_data$End_date2 <- NA

                      new_start_date <- as.Date(ifelse(prod_flag1 == 1 &
prod_flag2 == 1,NA,
                                                      ifelse(prod_flag1 ==
1 & prod_flag2 == 0 & end_dt2 > end_dt1 & st_dt2 < end_dt1,end_dt1,

ifelse(prod_flag1 == 0 & prod_flag2 == 1 & end_dt1 > end_dt2 & st_dt1 <
end_dt2,end_dt2,NA))),origin = "1970-01-01")
                      message(paste0("new_start_date = ",new_start_date))
                      new_start_date <- as.Date(new_start_date,origin =
"1970-01-01")
                      message(paste0("new_start_date = ",new_start_date))
                      new_end_date <- as.Date(ifelse(prod_flag1 == 1 &
prod_flag2 == 1,NA,
                                                    ifelse(prod_flag1 == 1
& prod_flag2 == 0 & end_dt2 > end_dt1 & st_dt2 < end_dt1,end_dt2,

ifelse(prod_flag1 == 0 & prod_flag2 == 1 & end_dt1 > end_dt2 & st_dt1 <
end_dt2,end_dt1,NA))),origin = "1970-01-01")
                      message(paste0("new_end_date = ",new_end_date))
                      new_end_date <- as.Date(new_end_date,origin =
"1970-01-01")
                      message(paste0("new_end_date = ",new_end_date))
                      prod_data[l,c("Start_date2")] <-
as.Date(new_start_date,origin = "1970-01-01")
                      prod_data[l,c("End_date2")] <-
as.Date(new_end_date,origin = "1970-01-01")

                      tmp_data <- subset(prod_data,!is.na
(prod_data$Start_date2))
                      tmp_data$Start_Date_1 <-
as.Date(tmp_data$Start_date2,origin = "1970-01-01")
                      tmp_data$End_Date_1 <-
as.Date(tmp_data$End_date2,origin = "1970-01-01")
                      if(nrow(tmp_data)){
                        tmp_data$new_rec <- 1
                        prod_data[l,c("End_Date_1")] <-
as.Date(end_dt1,origin = "1970-01-01")
                      }
                      prod_data <- rbind(prod_data,tmp_data)
                      tot_row <- tot_row + nrow(tmp_data)
                      tot_loop <- tot_loop + nrow(tmp_data)
                      prod_data$Start_date2 <- NULL
                      prod_data$End_date2 <- NULL
                      # }
                    }
                    # Condition to identify true subset

                    # overlap <- ifelse((st_dt1 >= st_dt2 & st_dt1 <=
end_dt2) & (end_dt1 >= st_dt2 & end_dt1 <= end_dt2),1,
                    #            ifelse((st_dt2 >= st_dt1 & st_dt2 <=
end_dt1) & (end_dt2 >= st_dt1 & end_dt2 <= end_dt1),1,0))



                    if((end_dt1 - st_dt2) >= 0){
                      if((end_dt2 - st_dt1) >= 0){
                        if((st_dt2 - st_dt1) >=0){
                          prod_data[k,c("exclude")] <- ifelse(prod_flag1 ==
1 & prod_flag2 == 1,9999, #if Anchor prods have overlapping

 ifelse(prod_flag1 == 1 & prod_flag2 == 0,0,

ifelse(prod_flag1 == 0 & prod_flag2 == 1,1,

 ifelse(prod_flag1 == 0 & prod_flag2 == 0,0,1))))
                          prod_data[l,c("exclude")] <- ifelse(prod_flag1 ==
1 & prod_flag2 == 1,9999,

 ifelse(prod_flag1 == 0 & prod_flag2 == 1,0,

ifelse(prod_flag1 == 1 & prod_flag2 == 0,1,

 ifelse(prod_flag1 == 0 & prod_flag2 == 0,1,0))))
                        }
                      }
                    }
                    # Condition to trim the dates as to make dates in each
observation mutually exclusive to exch other
                    flag_excl <- prod_data[l,c("exclude")]
                    if(flag_excl == 0){
                      if(end_dt1 > st_dt2){
                        if(st_dt1 >= st_dt2){
                          new_date <- ifelse(end_dt2 >
st_dt1,as.Date(st_dt1,origin = "1970-01-01"),as.Date(end_dt2,origin =
"1970-01-01"))
                          new_date <- as.Date(new_date,origin =
"1970-01-01")
                          old_date <-
as.Date(prod_data[l,c("End_Date_1")],origin = "1970-01-01")
                          old_date <- as.Date(old_date,origin =
"1970-01-01")

                          # prod_data[j,c("End_Date_1")] <-
ifelse(prod_flag1 == 1 & prod_flag2 == 1,as.Date(old_date,origin =
"1970-01-01"),
                          #
ifelse(prod_flag1 == 0 & prod_flag2 == 1,as.date(old_date, origin =
"1970-01-01"),as.Date(new_date,origin = "1970-01-01")))

                          prod_data[l,c("End_Date_1")] <-
as.Date(ifelse(prod_flag1 == 1 & prod_flag2 == 1,old_date,ifelse(prod_flag1
== 0 & prod_flag2 == 1,old_date,new_date)),origin = "1970-01-01")

                        }
                      }
                    }
                  }
                }

              }
            }
            k=k+1
          }
        }
      }
      # excluding non required observations
      prod_data <- subset(prod_data,prod_data$exclude == 0)

      prod_data$multiply_factor <- ifelse(prod_data$Anchor_prod == 1,1,
                                          ifelse(prod_data$Proximity_prod
== 1,0.5,9999))

      prod_data$recency_in_months <- (as.Date("2017-01-31") -
prod_data$End_Date_1)/30

      prod_data$recency_factor <- ifelse(prod_data$recency_in_months <=
12,1,

ifelse(prod_data$recency_in_months > 12 & prod_data$recency_in_months <=
24,0.9,

 ifelse(prod_data$recency_in_months > 24 & prod_data$recency_in_months <=
36,0.8,

ifelse(prod_data$recency_in_months > 36 & prod_data$recency_in_months <=
48,0.7,

 ifelse(prod_data$recency_in_months > 48,0.6,9999)))))

      prod_data$duration_in_months <- (prod_data$End_Date_1 -
prod_data$Start_Date_1)/30

      prod_data$weight <-
prod_data$duration_in_months*prod_data$multiply_factor*prod_data$recency_factor

      prod <- prod_data[1,c("Anchor_Active_Prod_Id")]
      if(nrow(prod_data) > 1){
        data_merge <-with(prod_data,aggregate(weight ~ P_KEY, FUN =
function(x) c(Proficiency_Score = sum(x))))
      }else{
        data_merge <- prod_data[1,c("P_KEY","weight")]
      }


      data_merge$prod <- prod_data[1,c("Anchor_Active_Prod_Id")]

      data_merge_final <- rbind(data_merge_final,data_merge)


      # Recency and Duration calculation goes here and final score will be
added in final data
      output_data <- rbind.fill(output_data,prod_data)

    }
  }


  Final_data <- merge(Final_data,data_merge_final,by.x= c("P_KEY",
"Active_Prod_Id"),by.y = c("P_KEY", "prod"),all.x=TRUE)

  names(Final_data)[names(Final_data) == "weight"] <- "Proficiency_Score"

  emerging_prod_mapping <- readRDS("5.Emerging_prod_Lookup.rds")

  emerging_prod_list <-
subset(emerging_prod_mapping[,c("prod_ID")],!duplicated(emerging_prod_mapping$prod_ID))


  Final_data$Emerging_Traditional <- ifelse(Final_data$Active_Prod_Id %in%
c(emerging_prod_list),"Emerging","Traditional")

  Final_data$Final_Proficiency <- ifelse(Final_data$Emerging_Traditional ==
"Traditional",

 ifelse(Final_data$Proficiency_Score < 12, "P0",

ifelse(Final_data$Proficiency_Score >=12 & Final_data$Proficiency_Score <
24,"P1",

 ifelse(Final_data$Proficiency_Score >=24 & Final_data$Proficiency_Score <
48,"P2",

ifelse(Final_data$Proficiency_Score >=48 & Final_data$Proficiency_Score <
60,"P3",

 ifelse(Final_data$Proficiency_Score >=60,"P4",NA))))),

 ifelse(Final_data$Emerging_Traditional == "Emerging",

ifelse(Final_data$Proficiency_Score < 6, "P0",

 ifelse(Final_data$Proficiency_Score >=6 & Final_data$Proficiency_Score <
12,"P1",

ifelse(Final_data$Proficiency_Score >=12 & Final_data$Proficiency_Score <
24,"P2",

 ifelse(Final_data$Proficiency_Score >=24 & Final_data$Proficiency_Score <
30,"P3",

ifelse(Final_data$Proficiency_Score >=30,"P4",NA))))),NA))

  tst <- prod_data[,c("P_KEY", "Id")]
  tst <- subset(tst,!duplicated(tst))

  Final_data <-
merge(Final_data,tst[,c("P_KEY","Id")],by="P_KEY",all.x=TRUE)
})

*SUMANTA BASAK*

    [[alternative HTML version deleted]]

______________________________________________
R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.


   
	[[alternative HTML version deleted]]



More information about the R-help mailing list