Web scraping with Selenium: multiple pages and product links issue
I am scraping with Selenium but cannot get the hrefs of all 25 pages and all 626 products listed on them. I want to collect every product's href across the 25 pages and then scrape multiple features from each product. But while extracting the page links, the pagination only gives pages 1 to 7 and then jumps directly to 25, so I cannot get all 25 page links or the products listed on them.
I then store the hrefs of all the products from these pages in a list and click through each product link.
import selenium
import pandas as pd
from selenium import webdriver
import getpass, time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException,StaleElementReferenceException
# First we connect to the webdriver
driver = webdriver.Chrome(r'/Users/ankit/chromedriver')
# Open the webpage with the webdriver
driver.get('https://www.getapp.com/hr-employee-management-software/human-resources/')
URL2 = [] # for product pages
URL = [] # for storing all the pages
URL3 = []  # for storing all video links
for i in range(1, 28):
    URL.append(
        f"https://www.getapp.com/hr-employee-management-software/human-resources/page-{i}/")
# visiting all the pages and scraping the products/Read More About... Links
for p in URL:
    driver.get(p)
    for i in driver.find_elements_by_xpath(
        '//a[@data-testid="listing-item_text-link_read-more-about-product"]'
    ):
        URL2.append(i.get_attribute("href"))
for i in URL2:
    try:
        wait = WebDriverWait(
            driver, 5
        )  # time to wait for an element to be found or accessible [wait variable used below]
        driver.get(i)  # going through each page
        elements = driver.find_elements_by_xpath("//img[contains(@src,'ytimg')]")
        for element in elements[0:1]:
            while True:  # making videos properly visible by clicking the right arrow
                try:
                    element.click()
                    break
                except Exception as e:
                    elemt = wait.until(
                        EC.element_to_be_clickable(
                            (By.XPATH, '//button[@data-evac="slide-to_right"]/div')
                        )
                    )
                    elemt.click()
                    time.sleep(0.7)
            driver.implicitly_wait(3)
            try:
                URL3.append(
                    driver.find_element_by_xpath(
                        '//iframe[contains(@id,"yt-player")]'
                    ).get_attribute("src")
                )  # collecting and adding it up
            except NoSuchElementException:
                URL3.append('--')
            elemt = wait.until(
                EC.element_to_be_clickable((By.XPATH, '//div[@title="Close"]'))
            )
            elemt.click()  # finally closing
    except Exception as e:
        print("failed", e, i)
# we will open the 1st product link to get all the necessary paths.
driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div[2]/div[2]/div[2]/div[2]/div[2]/a/p").click()
NAME=[]
OVERVIEW=[]
Image_url1=[]
Image_url2=[]
Image_url3=[]
Image_url4=[]
Image_url5=[]
#extracting and storing the Features of the product
FEATURE1=[]
FEATURE2=[]
FEATURE3=[]
FEATURE4=[]
FEATURE5=[]
PRICING=[]
for i in URL2:
    driver.get(i)
    try:
        name = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/h2/span")
        NAME.append(name.text.replace('product overview', '-'))
    except NoSuchElementException:
        NAME.append('--')
    try:
        overview = driver.find_element_by_xpath('//*[@id="__next"]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[1]/div/div[2]/p')
        OVERVIEW.append(overview.text)
    except NoSuchElementException:
        OVERVIEW.append('--')
    try:
        img = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[1]/img")
        Image_url1.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url1.append('--')
    try:
        img = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[1]/img")
        Image_url2.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url2.append('--')
    try:
        img = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[2]/img")
        Image_url3.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url3.append('--')
    try:
        img = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/div[3]/img")
        Image_url4.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url4.append('--')
    try:
        img = driver.find_element_by_tag_name("img")
        Image_url5.append(img.get_attribute("src"))
    except NoSuchElementException:
        Image_url5.append('--')
    try:
        feature1 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[1]/div")
        FEATURE1.append(feature1.text)
    except NoSuchElementException:
        FEATURE1.append('--')
    try:
        feature2 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div")
        FEATURE2.append(feature2.text)
    except NoSuchElementException:
        FEATURE2.append('--')
    try:
        feature3 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[3]/div")
        FEATURE3.append(feature3.text)
    except NoSuchElementException:
        FEATURE3.append('--')
    try:
        feature4 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[1]/div[4]/div")
        FEATURE4.append(feature4.text)
    except NoSuchElementException:
        FEATURE4.append('--')
    try:
        feature5 = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[3]/div/div[1]/div/div[2]/div/div[1]/div[2]/div[1]/div")
        FEATURE5.append(feature5.text)
    except NoSuchElementException:
        FEATURE5.append('--')
    try:
        pricing = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[1]/div/div[1]/div/div[1]/div[2]/div[1]/div/p[1]")
        PRICING.append(pricing.text)
    except NoSuchElementException:
        PRICING.append('--')
Solution 1:
You are not getting all the pages because the pagination is loaded dynamically on the website. You would need to click through the pagination controls to load the other pages (and the hrefs/links of those pages), as sketched below.
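If you did want to go the clicking route, it would look roughly like this. This is only a rough sketch, assuming the imports from the question above; the pagination selectors here are hypothetical and would need to be taken from the actual page.

# hypothetical sketch: collect the page links, then click the "next" control until it disappears
while True:
    for a in driver.find_elements_by_xpath('//a[contains(@href, "/page-")]'):
        URL.append(a.get_attribute("href"))
    try:
        driver.find_element_by_xpath('//a[@aria-label="Next Page"]').click()  # hypothetical selector
    except NoSuchElementException:
        break  # no "next" control left, so we are on the last page
URL = list(dict.fromkeys(URL))  # page links repeat across clicks, so deduplicate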
But a smarter way is to construct the URLs manually rather than scraping them, because they all follow the same pattern, like this:
URL = []
for i in range(1, 27):
    URL.append(f"https://www.getapp.com/hr-employee-management-software/human-resources/page-{i}/")
I understand that your next goal is to click on the Read More About... links. But here is where your approach goes wrong and becomes inefficient: after entering the first page, you immediately click on Read More About.... Instead, scrape all the Read More About... links per page first, then visit those scraped links one by one for the features.
Here is my complete approach:
import time

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())

URL2 = []  # for product pages
URL = []   # for storing all the pages
URL3 = []  # for storing all video links (used in the video block below)

for i in range(1, 27):
    URL.append(
        f"https://www.getapp.com/hr-employee-management-software/human-resources/page-{i}/"
    )

# visiting all the pages and scraping the products/Read More About... links
for p in URL:
    driver.get(p)
    for i in driver.find_elements_by_xpath(
        '//a[@data-testid="listing-item_text-link_read-more-about-product"]'
    ):
        URL2.append(i.get_attribute("href"))

# then collect the features by visiting the URL2 list
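When you do collect the features, one way to avoid repeating a try/except block for every field is a small helper that returns a placeholder when an element is missing. A minimal sketch, reusing the name and overview XPaths from the question (the grab helper name is just for illustration):

from selenium.common.exceptions import NoSuchElementException

def grab(xpath):
    # return the element's text, or '--' when it is not present on the page
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return '--'

NAME = []
OVERVIEW = []
for url in URL2:
    driver.get(url)
    NAME.append(grab("/html/body/div[1]/div[2]/div[2]/section[1]/h2/span").replace('product overview', '-'))
    OVERVIEW.append(grab('//*[@id="__next"]/div[2]/div[2]/section[1]/div/div[1]/div/div[1]/div[1]/div/div[2]/p'))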
It seems the videos are at the end of the preview section, and their links are not directly visible; they only become available once a video is clicked, because the players are embedded.
To achieve our goal, we can take these steps:
- Make the videos properly visible for clicking.
- Click on the videos (some products have multiple).
- Extract the link from the iframe.
- Close the video preview panel (for products with multiple videos, the next video needs to be properly visible before it can be clicked).
Code for this approach (steps explained in the comments):
for ul in URL2:
    try:
        wait = WebDriverWait(
            driver, 5
        )  # time to wait for an element to be found or accessible [wait variable used below]
        driver.get(ul)  # going through each page
        elements = driver.find_elements_by_xpath("//img[contains(@src,'ytimg')]")
        for element in elements[0:1]:  # use the slice limit here for the number of video links
            while True:  # making videos properly visible by clicking the right arrow
                try:
                    element.click()
                    break
                except Exception as e:
                    elemt = wait.until(
                        EC.element_to_be_clickable(
                            (By.XPATH, '//button[@data-evac="slide-to_right"]/div')
                        )
                    )
                    elemt.click()
                    time.sleep(0.7)
            driver.implicitly_wait(10)
            URL3.append(
                driver.find_element_by_xpath(
                    '//iframe[contains(@id,"yt-player")]'
                ).get_attribute("src")
            )  # collecting and adding it up
            elemt = wait.until(
                EC.element_to_be_clickable((By.XPATH, '//div[@title="Close"]'))
            )
            elemt.click()  # finally closing
    except Exception as e:
        print("failed", e, ul)
NOTE: in the case of an iframe, Selenium needs to switch into the iframe (or handle it in a different way). But luckily for you, the video links are available outside the iframe.
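For completeness, if the video links had lived inside the iframe, the handling would look roughly like this (a minimal sketch, using the same yt-player iframe XPath as above):

# switch into the embedded player iframe
iframe = driver.find_element_by_xpath('//iframe[contains(@id,"yt-player")]')
driver.switch_to.frame(iframe)
# ...locate elements that live inside the iframe here...
driver.switch_to.default_content()  # switch back to the main page when done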
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow