[R] extracting a table from pdf file....

akshay kulkarni @k@h@y_e4 @end|ng |rom hotm@||@com
Sun Jan 8 09:59:58 CET 2023


Dear members,
                            I am extracting a pdf table into a data frame from this URL:

https://www.canmoney.in/pdf/INTRADAYLEVERAGE-20220531-latest.pdf

I am using extract_tables() from the tabulizer package (it is archived, so I have installed it from GitHub):

IDTpdf <- extract_tables("https://www.canmoney.in/pdf/INTRADAYLEVERAGE-20220531-latest.pdf",output="data.frame")

But IDTpdf consists of four different data frames (one per page), and I want to collapse them into one. The dput of IDTpdf:

list(structure(list(SCRIPS.AVAILABLE.FOR.INTRADAY.WITH.LEVERAGES.PROVIDED.ON.THEM = c("S.No.",
"times)", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
"22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32",
"33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43",
"44", "45", "46", "47", "48", "49", "50", "51", "52", "53"),
    X = c("Scrip Name", "", "ALKEM LABORATORIES LTD.", "ATUL LTD",
    "ABB INDIA LIMITED", "AARTI INDUSTRIES LTD", "ABBOTT INDIA LIMITED",
    "ADITYA BIRLA CAPITAL LTD.", "ADITYA BIRLA FASHION & RT",
    "ACC LIMITED", "ADANI ENTERPRISES LIMITED", "ADANI PORT & SEZ LTD",
    "AMARA RAJA BATTERIES LTD.", "ASTRAL LIMITED", "AMBUJA CEMENTS LTD",
    "ALEMBIC PHARMA LTD", "APOLLO HOSPITALS ENTER. L", "APOLLO TYRES LTD",
    "ASHOK LEYLAND LTD", "ASIAN PAINTS LIMITED", "AU SMALL FINANCE BANK LTD",
    "AUROBINDO PHARMA LTD", "AXIS BANK LIMITED", "BAJAJ AUTO LIMITED",
    "BAJAJ FINSERV LTD.", "BAJAJ FINANCE LIMITED", "BALRAMPUR CHINI MILLS LTD",
    "BANDHAN BANK LIMITED", "BANK OF BARODA", "BATA INDIA LTD",
    "BHARAT ELECTRONICS LTD", "BERGER PAINTS (I) LTD", "BHARTI AIRTEL LIMITED",
    "BHEL", "BIOCON LIMITED.", "BOSCH LIMITED", "BRITANNIA INDUSTRIES LTD",
    "BIRLASOFT LIMITED", "ZYDUS LIFESCIENCES LTD", "CANARA BANK",
    "CAN FIN HOMES LTD", "CHAMBAL FERTILIZERS LTD", "CHOLAMANDALAM IN & FIN CO",
    "CIPLA LTD", "COAL INDIA LTD", "COFORGE LIMITED", "COLGATE PALMOLIVE LTD.",
    "CONTAINER CORP OF IND LTD", "COROMANDEL INTERNTL. LTD",
    "CROMPT GREA CON ELEC LTD", "CITY UNION BANK LTD", "CUMMINS INDIA LTD",
    "DABUR INDIA LTD", "DEEPAK NITRITE LTD", "DELTA CORP LIMITED"
    ), X.1 = c("Symbol Series", "", "ALKEM", "ATUL", "ABB", "AARTIIND",
    "ABBOTINDIA", "ABCAPITAL", "ABFRL", "ACC", "ADANIENT", "ADANIPORTS",
    "AMARAJABAT", "ASTRAL", "AMBUJACEM", "APLLTD", "APOLLOHOSP",
    "APOLLOTYRE", "ASHOKLEY", "ASIANPAINT", "AUBANK", "AUROPHARMA",
    "AXISBANK", "BAJAJ-AUTO", "BAJAJFINSV", "BAJFINANCE", "BALRAMCHIN",
    "BANDHANBNK", "BANKBARODA", "BATAINDIA", "BEL", "BERGEPAINT",
    "BHARTIARTL", "BHEL", "BIOCON", "BOSCHLTD", "BRITANNIA",
    "BSOFT", "ZYDUSLIFE", "CANBK", "CANFINHOME", "CHAMBLFERT",
    "CHOLAFIN", "CIPLA", "COALINDIA", "COFORGE", "COLPAL", "CONCOR",
    "COROMANDEL", "CROMPTON", "CUB", "CUMMINSIND", "DABUR", "DEEPAKNTR",
    "DELTACORP"), X.2 = c("Leverage (in", "", "4.5", "4.5", "4.5",
    "4.5", "4.5", "4.5", "4.5", "4.5", "4", "4.5", "4.5", "4.5",
    "4.5", "4.5", "4.5", "4.5", "4.5", "4.5", "4.5", "4.5", "4.5",
    "4.5", "4.5", "4.5", "4", "4", "4.5", "4.5", "4.5", "4.5",
    "4.5", "4", "4.5", "4.5", "4.5", "4", "4.5", "4", "4.5",
    "4", "4", "4.5", "4.5", "4", "4.5", "4.5", "4.5", "4.5",
    "4.5", "4.5", "4.5", "4.5", "4")), class = "data.frame", row.names = c(NA,
-55L)), structure(list(X54 = 55:110, DIVI.S.LABORATORIES.LTD = c("DIXON TECHNO (INDIA) LTD",
"DLF LIMITED", "DR. REDDY S LABORATORIES", "ESCORTS INDIA LTD",
"EXIDE INDUSTRIES LTD", "FEDERAL BANK LTD", "FIRSTSOURCE SOLU. LTD.",
"GAIL (INDIA) LTD", "GLENMARK PHARMACEUTICALS", "GMR INFRASTRUCTURE LTD.",
"GUJ NAR VAL FER & CHEM L", "DALMIA BHARAT LIMITED", "GODREJ CONSUMER PRODUCTS",
"GRANULES INDIA LIMITED", "GRASIM INDUSTRIES LTD", "GUJARAT STATE PETRO LTD",
"GUJARAT GAS LIMITED", "HINDUSTAN AERONAUTICS LTD", "HAVELLS INDIA LIMITED",
"HCL TECHNOLOGIES LTD", "HDFC LTD", "HDFC AMC LIMITED", "HDFC BANK LTD",
"HDFC LIFE INS CO LTD", "HERO MOTOCORP LIMITED", "HINDALCO INDUSTRIES LTD",
"HINDUSTAN COPPER LTD", "HONEYWELL AUTOMATION IND", "ICICI BANK LTD.",
"ICICI LOMBARD GIC LIMITED", "ICICI PRU LIFE INS CO LTD", "IDFC LIMITED",
"IDFC FIRST BANK LIMITED", "INDIAN ENERGY EXC LTD", "INDRAPRASTHA GAS LTD",
"THE INDIAN HOTELS CO. LTD", "THE INDIA CEMENTS LIMITED", "INDIAMART INTERMESH LTD",
"INTERGLOBE AVIATION LTD", "INDUSIND BANK LIMITED", "INDUS TOWERS LIMITED",
"INFOSYS LIMITED", "INTELLECT DESIGN ARENA", "INDIAN OIL CORP LTD",
"IPCA LABORATORIES LTD", "INDIAN RAIL TOUR CORP LTD", "ITC LTD",
"JINDAL STEEL & POWER LTD", "JSW STEEL LIMITED", "JUBILANT FOODWORKS LTD",
"KOTAK MAHINDRA BANK LTD", "L&T FINANCE HOLDINGS LTD", "DR. LAL PATH LABS LTD.",
"LAURUS LABS LIMITED", "LIC HOUSING FINANCE LTD", "LARSEN & TOUBRO LTD."
), DIVISLAB = c("DIXON", "DLF", "DRREDDY", "ESCORTS", "EXIDEIND",
"FEDERALBNK", "FSL", "GAIL", "GLENMARK", "GMRINFRA", "GNFC",
"DALBHARAT", "GODREJCP", "GRANULES", "GRASIM", "GSPL", "GUJGASLTD",
"HAL", "HAVELLS", "HCLTECH", "HDFC", "HDFCAMC", "HDFCBANK", "HDFCLIFE",
"HEROMOTOCO", "HINDALCO", "HINDCOPPER", "HONAUT", "ICICIBANK",
"ICICIGI", "ICICIPRULI", "IDFC", "IDFCFIRSTB", "IEX", "IGL",
"INDHOTEL", "INDIACEM", "INDIAMART", "INDIGO", "INDUSINDBK",
"INDUSTOWER", "INFY", "INTELLECT", "IOC", "IPCALAB", "IRCTC",
"ITC", "JINDALSTEL", "JSWSTEEL", "JUBLFOOD", "KOTAKBANK", "L&TFH",
"LALPATHLAB", "LAURUSLABS", "LICHSGFIN", "LT"), X4.5 = c(4.5,
4, 4.5, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5,
4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4, 4.5,
4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5, 4, 4, 4.5, 4, 4, 4.5, 4,
4.5, 4.5, 4, 4.5, 4, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5)), class = "data.frame", row.names = c(NA,
-56L)), structure(list(X111 = 112:167, L.T.INFOTECH.LIMITED = c("L&T TECHNOLOGY SER. LTD.",
"LUPIN LIMITED", "MAHINDRA & MAHINDRA LTD", "M&M FIN. SERVICES LTD",
"MANAPPURAM FINANCE LTD", "MARICO LIMITED", "MARUTI SUZUKI INDIA LTD.",
"UNITED SPIRITS LIMITED", "MULTI COMMODITY EXCHANGE", "MAX FINANCIAL SERV LTD",
"MAHANAGAR GAS LTD.", "MINDTREE LIMITED", "MOTHERSON SUMI SYSTEMS LT",
"MPHASIS LIMITED", "MRF LTD", "MUTHOOT FINANCE LIMITED", "NATIONAL ALUMINIUM CO LTD",
"INFO EDGE (I) LTD", "NAVIN FLUORINE INT. LTD", "NBCC (INDIA) LIMITED",
"NMDC LTD.", "NTPC LTD", "OBEROI REALTY LIMITED", "ORACLE FIN SERV SOFT LTD.",
"OIL AND NATURAL GAS CORP.", "PAGE INDUSTRIES LTD", "PIRAMAL ENTERPRISES LTD",
"PERSISTENT SYSTEMS LTD", "PETRONET LNG LIMITED", "POWER FIN CORP LTD.",
"PIDILITE INDUSTRIES LTD", "PI INDUSTRIES LTD", "PUNJAB NATIONAL BANK",
"POLYCAB INDIA LIMITED", "POWER GRID CORP. LTD.", "PVR LIMITED",
"RAIN INDUSTRIES LIMITED", "THE RAMCO CEMENTS LIMITED", "RBL BANK LIMITED",
"REC LIMITED", "RELIANCE INDUSTRIES LTD", "STEEL AUTHORITY OF INDIA",
"SBI CARDS & PAY SER LTD", "SBI LIFE INSURANCE CO LTD", "STATE BANK OF INDIA",
"SHREE CEMENT LIMITED", "SIEMENS LTD", "SRF LTD", "SHRIRAM TRANSPORT FIN CO.",
"STRIDES PHARMA SCI LTD", "SUN PHARMACEUTICAL IND L", "SUN TV NETWORK LIMITED",
"SYNGENE INTERNATIONAL LTD", "TATA CHEMICALS LTD", "TATA COMMUNICATIONS LTD",
"TATA CONSUMER PRODUCT LTD"), LTI = c("LTTS", "LUPIN", "M&M",
"M&MFIN", "MANAPPURAM", "MARICO", "MARUTI", "MCDOWELL-N", "MCX",
"MFSL", "MGL", "MINDTREE", "MOTHERSUMI", "MPHASIS", "MRF", "MUTHOOTFIN",
"NATIONALUM", "NAUKRI", "NAVINFLUOR", "NBCC", "NMDC", "NTPC",
"OBEROIRLTY", "OFSS", "ONGC", "PAGEIND", "PEL", "PERSISTENT",
"PETRONET", "PFC", "PIDILITIND", "PIIND", "PNB", "POLYCAB", "POWERGRID",
"PVR", "RAIN", "RAMCOCEM", "RBLBANK", "RECLTD", "RELIANCE", "SAIL",
"SBICARD", "SBILIFE", "SBIN", "SHREECEM", "SIEMENS", "SRF", "SRTRANSFIN",
"STAR", "SUNPHARMA", "SUNTV", "SYNGENE", "TATACHEM", "TATACOMM",
"TATACONSUM"), X4.5 = c(4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5,
4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4, 4.5,
4.5, 4.5, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5,
4.5, 4.5, 4, 4.5, 4, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5,
4.5, 4, 4, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5)), class = "data.frame", row.names = c(NA,
-56L)), structure(list(X168 = 169:198, TATA.MOTORS.LIMITED = c("TATA POWER CO LTD",
"TATA STEEL LIMITED", "TATA CONSULTANCY SERV LT", "TECH MAHINDRA LIMITED",
"TITAN COMPANY LIMITED", "TORRENT PHARMACEUTICALS L", "TORRENT POWER LTD",
"TRENT LTD", "TVS MOTOR COMPANY  LTD", "UNITED BREWERIES LTD",
"ULTRATECH CEMENT LIMITED", "UPL LIMITED", "VEDANTA LIMITED",
"VOLTAS LTD", "WHIRLPOOL OF INDIA LTD", "WIPRO LTD", "ZEE ENTERTAINMENT ENT LTD",
"BALKRISHNA IND. LTD", "BHARAT FORGE LTD", "BHARAT PETROLEUM CORP  LT",
"EICHER MOTORS LTD", "GODREJ PROPERTIES LTD", "HINDUSTAN PETROLEUM CORP",
"JK CEMENT LIMITED", "NESTLE INDIA LIMITED", "METROPOLIS HEALTHCARE LTD",
"HINDUSTAN UNILEVER LTD.", "VODAFONE IDEA LIMITED", "NIPPON L I A M LTD",
"INDIABULLS HSG FIN LTD"), TATAMOTORS = c("TATAPOWER", "TATASTEEL",
"TCS", "TECHM", "TITAN", "TORNTPHARM", "TORNTPOWER", "TRENT",
"TVSMOTOR", "UBL", "ULTRACEMCO", "UPL", "VEDL", "VOLTAS", "WHIRLPOOL",
"WIPRO", "ZEEL", "BALKRISIND", "BHARATFORG", "BPCL", "EICHERMOT",
"GODREJPROP", "HINDPETRO", "JKCEMENT", "NESTLEIND", "METROPOLIS",
"HINDUNILVR", "IDEA", "NAM-INDIA", "IBULHSGFIN"), X4 = c(4.5,
4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5,
4.5, 4.5, 3, 4.5, 4.5, 4.5, 4.5, 4, 4.5, 4.5, 4.5, 4.5, 4.5,
4.5, 4.5, 3)), class = "data.frame", row.names = c(NA, -30L)))

unlist(IDTpdf) is not working. It makes IDTpdf a lot messier...

I want IDTpdf to be a single data frame combining all four pages of the PDF at the above URL...

Any help will be greatly appreciated...

Thanking you,
Yours sincerely,
AKSHAY M KULKARNI


	[[alternative HTML version deleted]]



More information about the R-help mailing list