'Multiple requests to Scrape different pages is giving the same result
I want to scrape the ICC cricket website and find the rankings of batsmen on a particular date. The problem is that the result that I'm getting is the same for all the dates. The scraping is giving me the list of the most recent rankings rather than the ranking at a particular date. The code is given below. Can someone tell me why this is happening/ a solution for the same? I feel that the problem is that beautifulsoup is not letting the page load completely which is in turn giving false information as the information required is received after applying filters on the website.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import datetime
import os
date_list = pd.date_range(start = "1971-02-01", end=datetime.date.today(), freq='1d')
def get_batsmen(date):
url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date}'
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
player_list = []
find_top = doc.find('div', class_='rankings-block__banner--name-large')
player_list.append(find_top.text)
for item in find_class:
player_name = item.find("a")
# print(player_name.text)
player_list.append(player_name.text)
df = pd.DataFrame(player_list, columns = ['Player Name'])
return df
def get_bowler(date):
url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling?at={date}'
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
player_list = []
find_top = doc.find('div', class_='rankings-block__banner--name-large')
player_list.append(find_top.text)
for item in find_class:
player_name = item.find("a")
# print(player_name.text)
player_list.append(player_name.text)
df = pd.DataFrame(player_list, columns = ['Player Name'])
return df
def get_allrounder(date):
url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/all-rounder?at={date}'
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
player_list = []
find_top = doc.find('div', class_='rankings-block__banner--name-large')
player_list.append(find_top.text)
for item in find_class:
player_name = item.find("a")
# print(player_name.text)
player_list.append(player_name.text)
df = pd.DataFrame(player_list, columns = ['Player Name'])
return df
#Storing the data into multiple csvs
for date in date_list:
year = date.year
month = date.month
day = date.day
newpath = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}'
if not os.path.exists(newpath):
os.makedirs(newpath)
newpath1 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}'
if not os.path.exists(newpath1):
os.makedirs(newpath1)
newpath2 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}\{day}'
if not os.path.exists(newpath2):
os.makedirs(newpath2)
get_batsmen(date).to_csv(newpath2+'/batsmen.csv')
get_bowler(date).to_csv(newpath2+'/bowler.csv')
get_allrounder(date).to_csv(newpath2+'/allrounder.csv')
Solution 1:[1]
In case the website that you're scraping is interactive it can be good to look at Selenium as scraping package instead of bs4 so javascript is executed.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | Arno Maeckelberghe |