Code for scraping https://leagueoflegends.fandom.com/.

allChampions_scrapper.py gets the list of all available champions.
goldBounty_scrapper.py gets the gold bounty information associated with kills.
patchInfo_scrapper.py gets the URLs holding each patch's relevant information.
patchInfoDetail_scrapper.py (not fully developed) extracts the data behind each individual patch URL.
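
The scripts are independent except that patchInfoDetail_scrapper.py reads patch_info.csv and all_champions.csv, so allChampions_scrapper.py and patchInfo_scrapper.py have to run first. A minimal run-order sketch (assuming the four scripts and chromedriver.exe sit in the working directory):

import subprocess
import sys

# Run the scrapers in dependency order; the last one needs the CSVs written by the first and third
for script in ["allChampions_scrapper.py", "goldBounty_scrapper.py",
               "patchInfo_scrapper.py", "patchInfoDetail_scrapper.py"]:
    subprocess.run([sys.executable, script], check=True)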
# allChampions_scrapper.py
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 11 22:00:02 2020
@author: Alberto
"""
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

save_locations = ['']  # put other paths if necessary

driver = webdriver.Chrome("chromedriver.exe")
driver.get("https://leagueoflegends.fandom.com/wiki/List_of_champions")

champion_names = []
content = driver.page_source
soup = BeautifulSoup(content, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})
table_rows = table.findAll('td', {'style': 'text-align:left;'})
for i in table_rows:
    # Each left-aligned cell stores the champion name in its data-sort-value attribute
    champion_names.append(i['data-sort-value'])
driver.close()

all_champions = pd.DataFrame({'Champion Name': champion_names})
for path in save_locations:
    all_champions.to_csv(path + 'all_champions.csv', index=False, sep=',')
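
The jquery-tablesorter class in the selector is added by the wiki's JavaScript at render time, which is presumably why this page is fetched through Selenium rather than a plain HTTP request. If the browser window is a nuisance, Chrome can be started headless (a sketch, assuming a Selenium 3.x-era API matching the calls above):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # render the page without opening a window
driver = webdriver.Chrome("chromedriver.exe", options=options)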
# goldBounty_scrapper.py
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 11 13:07:28 2020
@author: Alberto
"""
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

save_locations = ['']  # put other paths if necessary

driver = webdriver.Chrome("chromedriver.exe")
driver.get("https://leagueoflegends.fandom.com/wiki/Kill")


def formatString(string, input_type):
    # Strip newlines and non-breaking spaces, then keep the part of the cell we need
    string = string.replace('\n', '')
    string = string.replace('\xa0', '')
    if len(string) == 0:
        string = np.nan  # empty cell
    elif len(string) > 1:
        if input_type == 0:
            # Tier column: keep the second character and prepend an ASCII minus
            string = '-' + string[1]
        elif input_type == 1 or input_type == 2:
            # Streak columns: keep the leading digit
            string = string[0]
    return string


tier = []
consec_kills = []
consec_deaths = []
kill_bounty = []
assist_bounty = []

content = driver.page_source
soup = BeautifulSoup(content, 'html.parser')
table = soup.find('table', {'class': 'article-table'})
table_rows = table.findAll('tr')

# Column position decides which list a cell belongs to
for i in table_rows:
    for idx, j in enumerate(i.findAll('td')):
        if idx == 0:
            got = formatString(j.get_text(), 0)
            tier.append(float(got))
        elif idx == 1:
            got = formatString(j.get_text(), 1)
            consec_kills.append(float(got))
        elif idx == 2:
            got = formatString(j.get_text(), 2)
            consec_deaths.append(float(got))
        elif idx == 3:
            got = j.find('span', {'style': 'white-space:normal;'}).get_text()
            kill_bounty.append(float(got))
        elif idx == 4:
            got = j.find('span', {'style': 'white-space:normal;'}).get_text()
            assist_bounty.append(float(got))
driver.close()

gold_info = pd.DataFrame({'Tier': tier, 'Consecutive Kills': consec_kills,
                          'Consecutive Deaths': consec_deaths, 'Kill Bounty': kill_bounty,
                          'Assist Bounty': assist_bounty})
for path in save_locations:
    gold_info.to_csv(path + 'gold_info.csv', index=False, sep=',')
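
Unlike the champion table, the bounty table's article-table class appears to be part of the server-rendered HTML, so the same data could plausibly be fetched without a browser at all. A sketch assuming the requests package is available and the page layout is unchanged:

import requests
from bs4 import BeautifulSoup

# Fetch the static HTML directly; no chromedriver needed
html = requests.get("https://leagueoflegends.fandom.com/wiki/Kill").text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'article-table'})
print('rows:', len(table.findAll('tr')) if table is not None else 'table not found')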
# patchInfoDetail_scrapper.py (not fully developed)
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 11 21:04:28 2020
@author: Alberto
"""
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re

month_dict = {'January': '1',
              'February': '2',
              'March': '3',
              'April': '4',
              'May': '5',
              'June': '6',
              'July': '7',
              'August': '8',
              'September': '9',
              'October': '10',
              'November': '11',
              'December': '12'}


def formatString(string):
    # Strip newlines and non-breaking spaces left over from the wiki markup
    string = string.replace('\n', '')
    string = string.replace('\xa0', '')
    return string


def returnDate(date):
    # Turn tokens like ['April', '11,', '2020'] into 'D-M-YYYY'
    month = month_dict[formatString(date[0])]
    year = date[2]
    day = re.search('[0-9]*', date[1]).group(0)
    return day + '-' + month + '-' + year


patch_info = pd.read_csv('patch_info.csv')         # produced by patchInfo_scrapper.py
champion_names = pd.read_csv('all_champions.csv')  # produced by allChampions_scrapper.py
save_locations = ['', 'C:\\Users\\Alberto\\Desktop\\UNED\\2O CUATRIMESTRE\\VD\\data\\scrapped\\']  # put other paths if necessary

patch_id = []
champion = []
patch_date = []
change_description = []  # not filled in yet; see the sketch after this script

driver = webdriver.Chrome("chromedriver.exe")
for i in range(1):  # limited to the first patch while developing; use range(len(patch_info)) for a full run
    url_check = patch_info.loc[i, 'Info URL']
    driver.get(url_check)
    content = driver.page_source
    soup = BeautifulSoup(content, 'html.parser')
    patch_id.append(soup.find('h1', {'class': 'page-header__title'}).get_text())
    date_info = soup.find('td', {'class': 'pi-horizontal-group-item pi-data-value pi-font pi-border-color pi-item-spacing'}).get_text().split(' ')
    patch_date.append(returnDate(date_info))
    champion_info = soup.findAll('dl')
    first_added = False  # There may be a more elegant way to do this
    for idx, j in enumerate(champion_info):
        champion_name = j.find('span', {'style': 'white-space:normal;'})
        if champion_name is not None:
            champion_name = champion_name.get_text()
            if champion_name in champion_names['Champion Name'].tolist():
                # The <ul> following the champion's <dl> holds the change bullets;
                # turning it into text is the part that is still to do
                changes = j.find_next_sibling('ul')
                print(changes)
                print("____________________")
                if first_added:
                    # Repeat the patch id and date so all lists stay the same length
                    patch_id.append(patch_id[-1])
                    patch_date.append(patch_date[-1])
                champion.append(champion_name)
                first_added = True
driver.close()

patchInfo_detail = pd.DataFrame({'ID': patch_id, 'Champion': champion, 'Date': patch_date})
for path in save_locations:
    patchInfo_detail.to_csv(path + 'patchInfo_detail.csv', index=False, sep=',')
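
To finish the extraction, the <ul> that find_next_sibling locates could be flattened into one description string per champion. A hypothetical helper (summarize_changes is not part of the original code and assumes the change bullets live in that sibling list):

def summarize_changes(ul):
    # Collapse every bullet of a champion's change list into one '; '-separated string
    if ul is None:
        return None
    return '; '.join(li.get_text(' ', strip=True) for li in ul.find_all('li'))

# Inside the loop, the two print calls would then become:
# change_description.append(summarize_changes(changes))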
# patchInfo_scrapper.py
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 11 17:48:01 2020
@author: Alberto
"""
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

save_locations = ['']  # put other paths if necessary

driver = webdriver.Chrome("chromedriver.exe")
driver.get("https://leagueoflegends.fandom.com/wiki/Patch")

base_patch_url = 'https://leagueoflegends.fandom.com'  # the table's hrefs are site-relative
lookup_data = {'Season Five': 2015, 'Season Six': 2016, 'Season Seven': 2017,
               'Season Eight': 2018}

season_year = []
info_url = []
patch_id = []

content = driver.page_source
soup = BeautifulSoup(content, 'html.parser')
table = soup.find('div', {'class': 'va-collapsible-content mw-collapsible-content'})
table_rows = table.findAll('tr')

for i in table_rows:
    match = False
    patch_id_aux = None
    # The first header cell names the season, the second holds a patch id pattern with an 'x' placeholder
    for idx, j in enumerate(i.findAll('th')):
        if idx == 0:
            got = j.find('a')
            if got is not None:
                got = got.get_text()
                if got in lookup_data:
                    match = True
                    season_year.append(lookup_data[got])
        elif idx == 1 and match:
            got = j.get_text()
            got = got.replace('\n', '')
            got = got.replace(' ', '')
            patch_id_aux = got
    if match:
        # Every list item in the row's data cell links to one patch page
        patches = i.find('td')
        for idx, j in enumerate(patches.findAll('li')):
            got = j.find('a', href=True)
            patch_rest_url = got['href']
            patch_rest_id = got.get_text()
            patch_id.append(patch_id_aux.replace('x', patch_rest_id))
            info_url.append(base_patch_url + patch_rest_url)
            if idx > 0:
                # The season year was appended once per row; repeat it for the extra patches
                season_year.append(season_year[-1])
driver.close()

patch_info = pd.DataFrame({'Season Year': season_year, 'Info URL': info_url,
                           'ID': patch_id})
for path in save_locations:
    patch_info.to_csv(path + 'patch_info.csv', index=False, sep=',')
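
patch_info.csv is the file patchInfoDetail_scrapper.py iterates over, so a quick sanity check of the output is worthwhile before running the detail scraper (the column names are the ones written above):

import pandas as pd

patch_info = pd.read_csv('patch_info.csv')
print(patch_info.shape)  # one row per patch
print(patch_info[['ID', 'Season Year', 'Info URL']].head())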