Need the total number of pages on a website to iterate over, but Selenium keeps timing out
I'm trying to fix a data crawler that was working perfectly until a couple of weeks ago. The script consists of two parts: one retrieves the links of the articles, and the other scrapes the specs of those articles; the site is https://www.fravega.com/ . The problem I'm having is in the first part, where the script gets the total number of pages on the site to use as an iterator for the link-retrieving loop. Below is the code in question:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from tqdm import tqdm

def get_links(postal_code, section):
    '''Function that returns a list of article links according to the postal code,
    the section, the number of pages and the website that contains them.'''
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    # chrome_options.add_argument('--no-sandbox')
    # chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument("start-maximized")
    # chrome_options.add_argument("enable-automation")
    # chrome_options.add_argument("--disable-infobars")
    # chrome_options.add_argument("--disable-browser-side-navigation")
    # chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
    wait = WebDriverWait(driver, 60)
    driver.get("https://www.fravega.com/")
    element = wait.until(EC.visibility_of_element_located((By.ID, "header-geo-location-form-postal-number")))  # Find the postal code field
    element.send_keys(postal_code)  # Type the postal code
    wait.until(lambda driver: element.get_attribute('value') == postal_code)
    element.submit()  # Submit the form
    wait.until(EC.presence_of_element_located((By.LINK_TEXT, section))).click()  # Find the section text and click on it
    wait.until(EC.visibility_of_element_located((By.XPATH, "//li[contains(@class,'sc-efd39989-1 laPjUm')]")))
    # wait.until(EC.visibility_of_element_located((By.ID,'pagination-next-button')))
    # wait.until(EC.presence_of_element_located((By.CLASS_NAME,'sc-efd39989-0 gUwmHE')))
    pags = int(driver.find_elements_by_xpath("//li[contains(@class,'sc-efd39989-1 laPjUm')]/a")[-1].text)
    # pags = int(wait.until(EC.visibility_of_element_located((By.XPATH, "//li[contains(@class,'sc-efd39989-1 laPjUm')]"))).text)
    print(pags)
    links = []
    for n in tqdm(range(1, pags+1), initial=1):
        wait.until(EC.visibility_of_element_located((By.XPATH, "//ul[contains(@class,'sc-e1732e90-0 fJzBdi')]")))
        events = driver.find_elements_by_xpath("//ul[contains(@class,'sc-e1732e90-0 fJzBdi')]/li")  # Find the elements of the article grid
        for event in events:
            link = event.find_element_by_tag_name('a').get_attribute('href')  # Get the link of each article
            links.append(link)  # Append it to the list
        try:
            driver.find_element_by_link_text('Siguiente >').click()  # Click on "Next"
        except NoSuchElementException:  # If there is no such element, break out of the loop
            break
    driver.close()
    driver.quit()  # Close the browser
    return links
The line where Selenium times out is this one:
wait.until(EC.visibility_of_element_located((By.XPATH, "//li[contains(@class,'sc-efd39989-1 laPjUm')]")))
As you can see in the commented-out lines in the code, I've tried other methods, but everything I've tried so far keeps timing Selenium out. I want to know if there is some way to make this work, or if there is some other way to get the total number of pages. Thanks.
Solution 1:[1]
The reason the script keeps timing out is simple: the links to the next pages only load after you scroll down to the bottom of the page, which is also most likely why the script stopped working.
So you basically have to first wait until the page with the items loads. I am using the filter form's style attribute, but you can wait for anything else (including just a simple sleep; a fallback sketch follows the snippet below).
# wait until page loads. You can wait for a specific element here
filter_form_selector = '[style="grid-area:filters-form"]'
wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, filter_form_selector)))
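If that style-based selector ever stops matching, the simple sleep mentioned above is a cruder fallback. A minimal sketch, assuming the same wait, EC, By and filter_form_selector names as above (the 10-second pause is an arbitrary guess):
import time
from selenium.common.exceptions import TimeoutException

try:
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, filter_form_selector)))
except TimeoutException:
    time.sleep(10)  # crude fallback: just give the item grid time to render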
Then, I scroll to the very bottom of the page. You could also scroll more naturally by sending a page-down key press multiple times in a loop (a sketch of that approach follows the snippet below).
# scroll to the bottom of the page
driver.execute_script('window.scrollTo(0, 1000000000);')
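For reference, here is a minimal sketch of that more natural page-down approach; the number of presses and the pause are guesses you would tune until the pagination appears:
import time
from selenium.webdriver.common.keys import Keys

body = driver.find_element(By.TAG_NAME, 'body')
for _ in range(15):  # rough number of presses; adjust for the page length
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)  # give lazy-loaded content a moment to render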
I then wait for the page links to appear:
page_count_css_selector = '[data-type=page]'
wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, page_count_css_selector)))
And finally, I retrieve the last page button and read its text to get the total number of pages:
pages_elements = driver.find_elements(By.CSS_SELECTOR, page_count_css_selector)
last_page = pages_elements[len(pages_elements) - 1]
pages = int(last_page.text)
print(pages)
Here's the full working code:
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def get_links(postal_code, section):
    '''Function that returns a list of article links according to the postal code,
    the section, the number of pages and the website that contains them.'''
    chromedriver_autoinstaller.install()
    chrome_options = Options()
    chrome_options.headless = False
    driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
    wait = WebDriverWait(driver, 60)
    driver.get("https://www.fravega.com/")
    element = wait.until(
        EC.visibility_of_element_located((By.ID, "header-geo-location-form-postal-number")))  # Find the postal code field
    element.send_keys(postal_code)  # Type the postal code
    wait.until(lambda driver: element.get_attribute('value') == postal_code)
    element.submit()  # Submit the form
    wait.until(
        EC.presence_of_element_located((By.LINK_TEXT, section))).click()  # Find the section text and click on it
    # wait until the page loads. You can wait for a specific element here
    filter_form_selector = '[style="grid-area:filters-form"]'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, filter_form_selector)))
    # scroll to the bottom of the page
    driver.execute_script('window.scrollTo(0, 1000000000);')
    page_count_css_selector = '[data-type=page]'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, page_count_css_selector)))
    pages_elements = driver.find_elements(By.CSS_SELECTOR, page_count_css_selector)
    last_page = pages_elements[len(pages_elements) - 1]
    pages = int(last_page.text)
    print(pages)
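A call could then look like this; the postal code and section name below are just placeholders, so use a real Argentine postal code and the section text exactly as it appears on the site:
get_links("1425", "Heladeras")  # hypothetical postal code and section name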
I would also advise you to set up a proxy for Selenium to hide your IP, since the shop you're trying to scrape might eventually block your IP from accessing its data, as they are clearly aware of scraping attempts.
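As a sketch of what that could look like with a plain HTTP proxy passed via Chrome's --proxy-server flag (the address below is a placeholder, not a working proxy):
PROXY = "203.0.113.10:3128"  # placeholder address; replace with a proxy you control
chrome_options = Options()
chrome_options.add_argument(f'--proxy-server=http://{PROXY}')
driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)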
Hope that helps!
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Zyy |