In [2]:
import requests 
import lxml.html as lh
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine
import pymysql
import mysql.connector as sql
import boto3
from io import StringIO
In [4]:
#Lift the column cap on DataFrame display so wide tables are shown in full
pd.options.display.max_columns = None
In [5]:
#Scrape target: the RealGM page listing Canadians in the NBA.
#Every later request in this notebook is made against this address.
url = "https://basketball.realgm.com/national/teams/23/Canada/nba_players"
In [6]:
#Download the page. The timeout stops the cell from hanging forever on a
#stalled connection, and raise_for_status() aborts on a 4xx/5xx response so
#an HTTP error page is never parsed as if it were the player table.
page = requests.get(url, timeout=30)
page.raise_for_status()

#Parse the raw response bytes into an lxml HTML element tree
doc = lh.fromstring(page.content)

#Collect every <tr>..</tr> element found anywhere in the document
tr_elements = doc.xpath('//tr')

#Column accumulator: a list of (header, values) pairs, one per table column
col=[]
#Running column index, initialised here for the loops that follow
i=0
In [7]:
#Fail with a clear message if the page yielded no table rows at all
if not tr_elements:
    raise ValueError("No <tr> elements were found in the downloaded page")

#The first <tr> is the header row: create one (title, values) pair per header
#cell. `col` is rebuilt from scratch here so that re-running this cell does
#not append the headers a second time.
col = [(header.text_content(), []) for header in tr_elements[0]]

#Number of columns in the table, taken from the header row
n_cols = len(col)

for T in tr_elements[1:]:
    #A row whose cell count differs from the header is not from our table,
    #so stop collecting at that point
    if len(T) != n_cols:
        break

    #Walk the cells of the row; i is the index of the column being filled
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        #Leave the first column as text; for every other column, store the
        #value as an int when the text is a whole number
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                #Not a whole number (e.g. "6-5", "20.08", "-"): keep the text
                pass
        #Append the value to the list belonging to the i'th column
        col[i][1].append(data)

#Assemble the DataFrame from the {title: values} mapping
Dict = {title: column for (title, column) in col}
df = pd.DataFrame(Dict)

#The collected rows can include a repeated header row (Player == 'Player');
#everything from the first such row onwards is discarded. When no repeated
#header is present the whole frame is kept instead of raising IndexError.
header_rows = df.index[df['Player'] == 'Player']
row = int(header_rows[0]) if len(header_rows) else len(df)

df = df.iloc[:row]
In [8]:
#Show the trimmed DataFrame as this cell's output
df
Out[8]:
Player Pos HT WT Birth Date Team GP MPG PPG RPG APG SPB BPG
0 Nickeil Alexander-Walker SG 6-5 205 Sep 2, 1998 New Orleans Pelicans 11 20.08 11.09 3.00 1.82 1.00 0.18
1 R.J. Barrett SG 6-7 202 Jun 14, 2000 New York Knicks 15 37.05 17.13 7.60 3.40 0.80 0.33
2 Khem Birch PF 6-9 230 Sep 28, 1992 Orlando Magic 14 21.41 7.64 6.14 1.07 0.93 0.36
3 Chris Boucher PF 6-10 200 Jan 11, 1993 Toronto Raptors 13 23.84 16.08 7.00 1.08 0.62 2.54
4 Ignas Brazdeikis SF 6-7 221 Jan 8, 1999 New York Knicks 4 1.77 0.50 0.50 0.25 0.00 0.00
5 Dillon Brooks F 6-6 220 Jan 22, 1996 Memphis Grizzlies 13 29.27 15.15 3.92 3.08 1.38 0.38
6 Brandon Clarke PF 6-8 210 Sep 19, 1996 Memphis Grizzlies 13 29.30 13.23 6.08 1.85 1.08 0.69
7 Nate Darling SG 6-5 200 Aug 30, 1998 Charlotte Hornets - - - - - - -
8 Luguentz Dort PG 6-4 215 Apr 19, 1999 Oklahoma City Thunder 13 28.27 13.38 3.92 0.92 1.15 0.31
9 Shai Gilgeous-Alexander PG 6-6 181 Jul 12, 1998 Oklahoma City Thunder 13 33.36 21.00 5.31 6.23 0.85 0.46
10 Cory Joseph PG 6-3 200 Aug 20, 1991 Sacramento Kings 14 20.57 6.43 2.21 2.21 0.64 0.36
11 Mfiondu Kabengele SF 6-10 250 Aug 14, 1997 Los Angeles Clippers 9 4.93 0.89 0.67 0.22 0.22 0.22
12 Trey Lyles SF 6-10 235 Nov 5, 1995 San Antonio Spurs 7 12.18 2.29 3.71 0.43 0.00 0.00
13 Karim Mane PG 6-5 185 May 16, 2000 Orlando Magic 6 9.31 1.17 1.67 0.17 0.00 0.17
14 Mychal Mulder GF 6-3 184 Jun 12, 1994 Golden State Warriors 12 10.30 4.08 1.00 0.50 0.17 0.08
15 Jamal Murray SG 6-4 215 Feb 23, 1997 Denver Nuggets 13 34.66 19.15 3.46 4.00 1.00 0.15
16 Kelly Olynyk C 7-0 240 Apr 19, 1991 Miami Heat 11 28.70 10.91 5.27 2.55 0.91 0.91
17 Dwight Powell PF 6-10 240 Jul 20, 1991 Dallas Mavericks 9 19.72 5.00 3.56 1.22 1.22 0.56
18 Tristan Thompson F 6-9 254 Mar 13, 1991 Boston Celtics 10 23.14 7.40 8.50 0.70 0.40 0.80
19 Andrew Wiggins F 6-8 194 Feb 23, 1995 Golden State Warriors 13 32.71 17.77 4.54 2.38 0.46 1.62
In [ ]: