Need the total number of pages on a website to iterate over, but Selenium keeps timing out

I'm trying to fix a data crawler that was working perfectly until a couple of weeks ago. The script consists of two parts: one retrieves the links of the articles, and the other scrapes the specs of those articles. The site is https://www.fravega.com/. The problem I'm having is in the first part, where the script gets the total number of pages on the site to use as an iterator for the link-retrieving loop. Below is the code in question:

def get_links(postal_code, section):
    '''Returns a list of article links for a given postal code and section,
    based on the number of pages on the site that contains them.'''
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    # chrome_options.add_argument('--no-sandbox')
    # chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument("start-maximized")
    # chrome_options.add_argument("enable-automation")
    # chrome_options.add_argument("--disable-infobars")
    # chrome_options.add_argument("--disable-browser-side-navigation")
    # chrome_options.add_argument("--disable-gpu")

    driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
    wait = WebDriverWait(driver, 60)
    driver.get("https://www.fravega.com/")
    element = wait.until(EC.visibility_of_element_located((By.ID, "header-geo-location-form-postal-number"))) # Locate the postal code field
    element.send_keys(postal_code) # Type the postal code
    wait.until(lambda driver: element.get_attribute('value') == postal_code)
    element.submit() # Submit the form
    
    wait.until(EC.presence_of_element_located((By.LINK_TEXT, section))).click() # Find the section link text and click it
   

    wait.until(EC.visibility_of_element_located((By.XPATH, "//li[contains(@class,'sc-efd39989-1 laPjUm')]")))
    # wait.until(EC.visibility_of_element_located((By.ID,'pagination-next-button')))
    # wait.until(EC.presence_of_element_located((By.CLASS_NAME,'sc-efd39989-0 gUwmHE')))
    
    pags = int(driver.find_elements_by_xpath("//li[contains(@class,'sc-efd39989-1 laPjUm')]/a")[-1].text)
    # pags = int(wait.until(EC.visibility_of_element_located((By.XPATH, "//li[contains(@class,'sc-efd39989-1 laPjUm')]"))).text)

    print(pags)
    links = []

    for n in tqdm(range(1, pags+1), initial=1):
        wait.until(EC.visibility_of_element_located((By.XPATH, "//ul[contains(@class,'sc-e1732e90-0 fJzBdi')]")))
        events = driver.find_elements_by_xpath("//ul[contains(@class,'sc-e1732e90-0 fJzBdi')]/li") # Find the elements of the article grid
        for event in events:
            link = event.find_element_by_tag_name('a').get_attribute('href') # Get the link of each article
            links.append(link) # Append it to the list
        try:
            driver.find_element_by_link_text('Siguiente >').click() # Click on "Next"
        except NoSuchElementException: # If there is no such element, break out of the loop
            break
    
    driver.close() 
    driver.quit() # Close the browser

    return links

The line where Selenium times out is this one:

wait.until(EC.visibility_of_element_located((By.XPATH, "//li[contains(@class,'sc-efd39989-1 laPjUm')]")))

As you can see from the commented lines, I've tried other methods, but everything I've tried so far keeps timing Selenium out. I want to know if there is some way of making this work, or if there is some other way to get the total number of pages. Thanks.



Solution 1:[1]

The reason the script keeps timing out is simple: the links to the next pages only load after you scroll down to the bottom of the page, which is also likely why the script stopped working.

So, basically, you first have to wait until the page with the items loads. I am using the filter form's style attribute, but you can wait on anything else (including a simple sleep):

    # wait until page loads. You can wait for a specific element here
    filter_form_selector = '[style="grid-area:filters-form"]'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, filter_form_selector)))

Then, I scroll to the very bottom of the page. You could also scroll more naturally by sending a page-down key press multiple times in a loop, as sketched after the snippet below.

    # scroll to the bottom of the page
    driver.execute_script('window.scrollTo(0, 1000000000);')
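
For reference, here is a minimal sketch of that more natural page-down approach; the number of presses and the pause length are arbitrary placeholders, not values from the original answer:

    from selenium.webdriver.common.keys import Keys
    import time

    # Press PAGE_DOWN repeatedly instead of one big scrollTo jump
    body = driver.find_element(By.TAG_NAME, 'body')
    for _ in range(10):  # 10 presses is a guess; adjust to the page length
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)  # short pause so lazy-loaded content can appear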

I then wait for the page-number links to appear:

    page_count_css_selector = '[data-type=page]'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, page_count_css_selector)))

And finally, I retrieve the last page button and read its text to get the total number of pages:

    pages_elements = driver.find_elements(By.CSS_SELECTOR, page_count_css_selector)
    last_page = pages_elements[len(pages_elements) - 1]
    pages = int(last_page.text)

    print(pages)

Here's the full working code:

import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def get_links(postal_code, section):
    '''Returns a list of article links for a given postal code and section,
    based on the number of pages on the site that contains them.'''
    chromedriver_autoinstaller.install()

    chrome_options = Options()
    chrome_options.headless = False

    driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
    wait = WebDriverWait(driver, 60)
    driver.get("https://www.fravega.com/")
    element = wait.until(
        EC.visibility_of_element_located((By.ID, "header-geo-location-form-postal-number")))  # Locate the postal code field
    element.send_keys(postal_code)  # Type the postal code
    wait.until(lambda driver: element.get_attribute('value') == postal_code)
    element.submit()  # Submit the form

    wait.until(
        EC.presence_of_element_located((By.LINK_TEXT, section))).click()  # Find the section link text and click it

    # wait until page loads. You can wait for a specific element here
    filter_form_selector = '[style="grid-area:filters-form"]'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, filter_form_selector)))

    # scroll to the bottom of the page
    driver.execute_script('window.scrollTo(0, 1000000000);')

    page_count_css_selector = '[data-type=page]'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, page_count_css_selector)))

    pages_elements = driver.find_elements(By.CSS_SELECTOR, page_count_css_selector)
    last_page = pages_elements[len(pages_elements) - 1]
    pages = int(last_page.text)

    print(pages)
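
The snippet above stops at the page count. If you want to continue into the link-collecting loop from the question, a rough sketch could look like the following; the grid class names and the 'Siguiente >' link text are taken from the original script and may have changed since:

    # continuing inside get_links(); NoSuchElementException comes from selenium.common.exceptions
    links = []
    for n in range(1, pages + 1):
        # re-scroll so the lazily loaded grid and pagination controls appear
        driver.execute_script('window.scrollTo(0, 1000000000);')
        wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "ul[class*='sc-e1732e90-0'] > li")))
        for item in driver.find_elements(By.CSS_SELECTOR, "ul[class*='sc-e1732e90-0'] > li"):
            links.append(item.find_element(By.TAG_NAME, 'a').get_attribute('href'))
        try:
            driver.find_element(By.LINK_TEXT, 'Siguiente >').click()  # click "Next"
        except NoSuchElementException:
            break

    driver.quit()
    return links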

I would also advise you to set up a proxy for Selenium to hide your IP, since the shop you're trying to scrape might eventually block your IP from accessing its data, as they are clearly aware of your scraping attempts.
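
As a minimal sketch, assuming Chrome's standard --proxy-server argument, a proxy could be wired in like this (the host and port are placeholders, not a real endpoint):

    # Hypothetical proxy endpoint; replace with your own proxy service
    proxy = 'http://my-proxy-host:8080'

    chrome_options = Options()
    chrome_options.add_argument(f'--proxy-server={proxy}')
    driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)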

Hope that helps!

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution 1: Zyy