Google Maps data scraping using Puppeteer and Node.js – error issue
I am trying to scrape data from Google Maps. I wrote a Puppeteer script and ran it with Node.js, but I am receiving this error:
Recieved an error, attempting to move on...
(node:6708) UnhandledPromiseRejectionWarning: TypeError: (intermediate value) is not iterable (cannot read property undefined)
at main (C:\Users\emrah\OneDrive\Desktop\pups\google.js:133:18)
(Use `node --trace-warnings ...` to show where the warning was created)
(node:6708) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not
handled with .catch(). To terminate the node process on unhandled promise rejection, use the CLI flag `--unhandled-rejections=strict` (see https://nodejs.org/api/cli.html#cli_unhandled_rejections_mode). (rejection id: 1)
(node:6708) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
My code is below. Can you please help with what I am missing? Thanks
const puppeteer = require('puppeteer'); /// import puppeteer from "puppeteer";
// Base pause in milliseconds; raise it when the browser or device is slow.
const defaultDelay = 300;
// Master switch for the debug logger below.
let debugBool = true;
// Minimal logger: joins its arguments with single spaces and prints the result,
// but only while debugBool is truthy.
let debug = {
  log(...parts) {
    return debugBool && console.log(parts.join(' '));
  },
};
const xlsx = require('xlsx');
// Get the data
/**
 * Opens one place page and reads the shop name, address and website from it.
 *
 * The shop name comes from the aria-label of the main panel; address and
 * website come from the inner text of their elements. An empty value is
 * replaced by a placeholder string, and every value is trimmed.
 *
 * @param {string} url - Address of the place page to open.
 * @param {object} page - Puppeteer page used for navigation and lookups.
 * @returns {Promise<{shop: string, address: string, website: string}>}
 */
async function getPageData(url, page) {
  const mainPanel = '[role="main"]';
  const readInnerText = node => node.innerText;
  const tidy = value => value?.trim?.();

  await page.goto(url);
  // A timed-out wait is only logged; the lookups below run regardless.
  await page.waitForSelector(mainPanel).catch(movingOn);

  const label = await page.$eval(mainPanel, node =>
    node.getAttribute('aria-label')
  );
  const addressText = await page.$eval(
    'button[data-item-id="address"]',
    readInnerText
  );
  const websiteText = await page.$eval(
    '[data-tooltip="Open website"]',
    readInnerText
  );

  const record = {
    shop: tidy(label || 'No shop name provided'),
    address: tidy(addressText || 'Delivery service (No address)'),
    website: tidy(websiteText || 'No website provided'),
  };
  console.log(record);
  return record;
}
//Get Links
// Scrolls the element matched by divSelector until its scrollHeight stops changing, then
// returns the href of every <a> on the page that matches the Google Maps URL pattern below.
// Rejects if any of the page calls reject (for example when divSelector matches nothing).
async function getLinks(page) {
// Scrolling to bottom of page
let newScrollHeight = 0;
let scrollHeight = 1000;
let divSelector = '#pane > div > div > div > div > div:nth-child(4) > div';
debug.log('Waiting for the page to load in');
await page.waitForTimeout(defaultDelay * 11);
debug.log('Starting to scroll now');
while (true) {
// NOTE: .catch() is called without a handler, so a rejected wait is NOT handled here;
// the rejection still propagates out of this await.
await page.waitForSelector(divSelector).catch();
// Scroll the container to the last height that was measured.
await page.evaluate(
(scrollHeight, divSelector) =>
document.querySelector(divSelector).scrollTo(0, scrollHeight),
scrollHeight,
divSelector
);
await page.waitForTimeout(defaultDelay);
newScrollHeight = await page.$eval(
divSelector,
div => div.scrollHeight
);
debug.log('scrolled by', newScrollHeight);
// An unchanged height means the scroll added no more content, so stop.
if (scrollHeight === newScrollHeight) {
break;
} else {
scrollHeight = newScrollHeight;
}
}
debug.log('finished scrolling');
// Get results
// Keep links that contain a Google Maps URL but drop those where that URL only appears
// after an '=' (i.e. as a query-string value).
// NOTE: String.prototype.match takes a single argument, so the second `link` is ignored;
// the dots in the patterns are unescaped and therefore match any character.
const searchResults = await page.evaluate(() =>
Array.from(document.querySelectorAll('a'))
.map(el => el.href)
.filter(
link =>
link.match(/https:\/\/www.google.com\/maps\//g, link) &&
!link.match(/\=https:\/\/www.google.com\/maps\//g, link)
)
);
console.log(searchResults);
debug.log('I got', searchResults.length, 'results');
return searchResults;
}
/**
 * Reports whether the "Next page" button carries a truthy `disabled`
 * attribute value, logging the outcome through the debug logger.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @returns {Promise<boolean>} true when the attribute value is truthy.
 */
async function isNextButtonDisabled(page) {
  const nextButtonSelector = 'button[aria-label=" Next page "]';
  const disabled = await page.$eval(nextButtonSelector, node =>
    Boolean(node.getAttribute('disabled'))
  );

  let position = 'not at the end of the pages';
  if (disabled) {
    position = ' at the end of the pages';
  }
  debug.log('We are', position);

  return disabled;
}
/** Logs (via the debug logger) that a wait timed out and the script continues. */
function movingOn() {
  const notice = 'Wait timed out, moving on...';
  debug.log(notice);
}
/** Logs (via the debug logger) that an error occurred and the script continues. */
function genericMovingOn() {
  const notice = 'Recieved an error, attempting to move on...';
  debug.log(notice);
}
// Entry point: opens Google Maps for searchQuery, walks the result pages collecting place
// links, scrapes each link with getPageData and writes the rows to flowershop.xlsx.
// The browser launched here is not closed anywhere in this function.
async function main(searchQuery = 'flower shop des moines Iowa') {
const browser = await puppeteer.launch({ headless: false });
const [page] = await browser.pages();
// NOTE: searchQuery is concatenated into the URL without URL-encoding.
await page.goto('https://www.google.com/maps/?q=' + searchQuery);
await page
.waitForNavigation({ waitUntil: 'domcontentloaded' })
.catch(movingOn);
await page.waitForTimeout(defaultDelay * 10);
let allLinks = [];
// genericMovingOn returns nothing, so a rejected isNextButtonDisabled yields undefined
// here and the loop body still runs.
while (!(await isNextButtonDisabled(page).catch(genericMovingOn))) {
// If it hasn't go to the next page
// NOTE: this is where the reported TypeError comes from. When getLinks rejects,
// .catch(genericMovingOn) resolves to undefined, and spreading undefined throws
// "(intermediate value) is not iterable".
allLinks.push(...(await getLinks(page).catch(genericMovingOn)));
await page
.$eval('button[aria-label=" Next page "]', element =>
element.click()
)
.catch(genericMovingOn);
debug.log('moving to the next page');
// Leaves the loop before getLinks runs for the page that was just opened.
if (await isNextButtonDisabled(page).catch(genericMovingOn)) break;
await page
.waitForNavigation({ waitUntil: 'domcontentloaded' })
.catch(movingOn);
}
// Remove duplicate links while keeping first-seen order.
allLinks = Array.from(new Set(allLinks));
console.log(allLinks);
let scrapedData = [];
for (let i = 0; i < allLinks.length; i++) {
let link = allLinks[i];
// A failed page resolves to undefined here; those entries are filtered out below.
let data = await getPageData(link, page).catch(genericMovingOn);
scrapedData.push(data);
}
scrapedData = scrapedData.filter(Boolean)
const wb = xlsx.utils.book_new();
const ws = xlsx.utils.json_to_sheet(scrapedData);
// NOTE: `{origin: -1}` sits after the closing parenthesis, so it is a separate
// comma-operator operand and is not passed to book_append_sheet.
xlsx.utils.book_append_sheet(wb,ws), {origin: -1};
xlsx.writeFile(wb,"flowershop.xlsx");
console.log(scrapedData);
debug.log("Scrape complete!")
}
// Clear the terminal, then start the scrape with the default search query.
console.clear();
// NOTE: the promise returned by main() is neither awaited nor given a rejection handler,
// so an error thrown inside main is reported as an UnhandledPromiseRejectionWarning.
main();
Solution 1:[1]
The problem in your code was a few wrong selectors. I fixed them and also rewrote the places where you mixed async/await with .then()/.catch() chains, since you should stick to one of those approaches. I changed a few more things as well; check the code in the online IDE.
const puppeteer = require("puppeteer"); /// import puppeteer from "puppeteer";
// Base pause in milliseconds; raise it when the browser or device is slow.
const defaultDelay = 1000;
// Switch that turns the debug logger on or off.
const debugBool = true;
// Logger that prints its arguments joined by single spaces while debugBool is truthy.
const debug = {
  log: function log(...pieces) {
    if (!debugBool) {
      return debugBool;
    }
    return console.log(pieces.join(" "));
  },
};
// const xlsx = require('xlsx');
// Get the data
/**
 * Open a Google Maps place page and extract the shop name, address and website.
 *
 * Each field is looked up independently. When its element is missing, or its
 * text is empty after trimming, the field falls back to a placeholder string
 * instead of rejecting, so one absent detail (for example a shop without a
 * website) no longer discards the whole record.
 *
 * @param {string} url - Place URL to open.
 * @param {object} page - Puppeteer Page used for navigation and DOM queries.
 * @returns {Promise<{shop: string, address: string, website: string}>}
 *   The scraped record; it is also printed with console.log.
 */
async function getPageData(url, page) {
  // page.$ resolves to null for a missing element, whereas page.$eval rejects,
  // which made the `|| fallback` defaults unreachable in that case.
  const readField = async (selector, pageFunction, fallback) => {
    const handle = await page.$(selector);
    if (!handle) {
      return fallback;
    }
    try {
      const raw = await handle.evaluate(pageFunction);
      const text = typeof raw === "string" ? raw.trim() : "";
      return text || fallback;
    } finally {
      // Release the element handle whether or not the evaluation succeeded.
      await handle.dispose();
    }
  };

  await page.goto(url);
  try {
    await page.waitForSelector('[role="main"]');
  } catch (e) {
    // Best effort: a timed-out wait is logged and the lookups below still run.
    movingOn();
  }

  const returnObj = {
    shop: await readField('[role="main"]', (element) => element.getAttribute("aria-label"), "No shop name provided"),
    address: await readField('button[data-item-id="address"]', (element) => element.innerText, "Delivery service (No address)"),
    website: await readField('[data-tooltip="Open website"]', (element) => element.innerText, "No website provided"),
  };
  console.log(returnObj);
  return returnObj;
}
//Get Links
/**
 * Scroll the results list to its end and collect the Google Maps links on the page.
 *
 * The list container is scrolled repeatedly until its scrollHeight stops
 * changing. If the container cannot be found, scrolling is skipped and the
 * links that are already in the DOM are still returned, instead of failing
 * on a null element.
 *
 * @param {object} page - Puppeteer Page showing the search results.
 * @returns {Promise<string[]>} hrefs that contain https://www.google.com/maps/
 *   and do not merely carry that URL as a query-string value.
 */
async function getLinks(page) {
  const divSelector = "[role='main'] > div:nth-child(2) > div";
  let scrollHeight = 1000;

  debug.log("Waiting for the page to load in");
  await page.waitForTimeout(defaultDelay * 11);
  debug.log("Starting to scroll now");
  while (true) {
    try {
      await page.waitForSelector(divSelector);
    } catch (e) {
      // Without the container there is nothing to scroll; stop scrolling but
      // still gather whatever links are present.
      movingOn();
      break;
    }
    await page.evaluate(
      // Runs in the page; ?. guards against the container vanishing meanwhile.
      (targetHeight, selector) => document.querySelector(selector)?.scrollTo(0, targetHeight),
      scrollHeight,
      divSelector
    );
    await page.waitForTimeout(defaultDelay);
    const newScrollHeight = await page.$eval(divSelector, (div) => div.scrollHeight);
    debug.log("scrolled by", newScrollHeight);
    // An unchanged height means the last scroll loaded nothing new.
    if (scrollHeight === newScrollHeight) {
      break;
    }
    scrollHeight = newScrollHeight;
  }
  debug.log("finished scrolling");

  // Plain substring checks replace the regexes: String.prototype.match takes a
  // single argument, and the unescaped dots matched any character.
  const searchResults = await page.evaluate(() => {
    const mapsPrefix = "https://www.google.com/maps/";
    return Array.from(document.querySelectorAll("a"))
      .map((el) => el.href)
      .filter((link) => link.includes(mapsPrefix) && !link.includes("=" + mapsPrefix));
  });
  console.log(searchResults);
  debug.log("I got", searchResults.length, "results");
  return searchResults;
}
/**
 * Tell whether the results list has no further page to move to.
 *
 * `disabled` is a boolean attribute: a bare `<button disabled>` reports an
 * empty-string value, which is falsy, so the attribute's presence is tested
 * instead of its value. A missing "Next page" button is treated as the end
 * of the results, because there is nothing left to click.
 *
 * @param {object} page - Puppeteer Page showing the search results.
 * @returns {Promise<boolean>} true when the button is disabled or absent.
 */
async function isNextButtonDisabled(page) {
  const nextButton = await page.$('button[aria-label=" Next page "]');
  let state = true;
  if (nextButton) {
    try {
      state = await nextButton.evaluate((button) => button.hasAttribute("disabled"));
    } finally {
      // Release the element handle whether or not the evaluation succeeded.
      await nextButton.dispose();
    }
  }
  debug.log("We are", state ? " at the end of the pages" : "not at the end of the pages");
  return state;
}
/**
 * Log that a wait timed out and the script is continuing.
 *
 * @param {Error} [error] - Optional error that caused the timeout; when given,
 *   its message is appended so the cause is not silently lost. Calling the
 *   function without arguments logs exactly the original message.
 */
function movingOn(error) {
  const detail = error?.message ? ` (${error.message})` : "";
  debug.log(`Wait timed out, moving on...${detail}`);
}
/**
 * Log that an error occurred and the script is attempting to continue.
 *
 * @param {Error} [error] - Optional error that was caught; when given, its
 *   message is appended so the cause is not silently lost. Calling the
 *   function without arguments logs exactly the original message.
 */
function genericMovingOn(error) {
  const detail = error?.message ? ` (${error.message})` : "";
  debug.log(`Recieved an error, attempting to move on...${detail}`);
}
/**
 * Search Google Maps for `searchQuery`, collect the place links from every
 * results page, scrape each place and print the collected records.
 *
 * Compared with the earlier flow:
 *  - links are gathered from a page BEFORE checking for a next page, so the
 *    last (or only) results page is no longer skipped;
 *  - a failed or missing "Next page" button ends pagination instead of
 *    looping forever;
 *  - the query is URL-encoded;
 *  - the browser is always closed, even when scraping throws.
 *
 * @param {string} [searchQuery] - Text to search for on Google Maps.
 * @returns {Promise<object[]>} The records returned by getPageData for every
 *   place that could be scraped.
 */
async function main(searchQuery = "flower shop des moines Iowa") {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const [page] = await browser.pages();
    // Encode the query so spaces, '&', '#', etc. cannot break the URL.
    await page.goto("https://www.google.com/maps/?q=" + encodeURIComponent(searchQuery));
    try {
      await page.waitForNavigation({ waitUntil: "domcontentloaded" });
    } catch (e) {
      movingOn(e);
    }
    await page.waitForTimeout(defaultDelay * 10);

    // A Set removes duplicate links while keeping first-seen order.
    const allLinks = new Set();
    while (true) {
      try {
        for (const link of await getLinks(page)) {
          allLinks.add(link);
        }
      } catch (e) {
        genericMovingOn(e);
      }

      // If the button state cannot be read, assume there is no next page.
      let isDisabled = true;
      try {
        isDisabled = await isNextButtonDisabled(page);
      } catch (e) {
        genericMovingOn(e);
      }
      if (isDisabled) {
        break;
      }

      try {
        await page.$eval('button[aria-label=" Next page "]', (element) => element.click());
        debug.log("moving to the next page");
      } catch (e) {
        // The page could not be advanced; retrying would only re-read the same results.
        genericMovingOn(e);
        break;
      }
      try {
        await page.waitForNavigation({ waitUntil: "domcontentloaded" });
      } catch (e) {
        movingOn(e);
      }
    }

    const uniqueLinks = [...allLinks];
    console.log(uniqueLinks);

    const scrapedData = [];
    for (const link of uniqueLinks) {
      try {
        scrapedData.push(await getPageData(link, page));
      } catch (e) {
        // Skip places that fail to load or parse and keep going.
        genericMovingOn(e);
      }
    }
    // Optional Excel export (requires the `xlsx` package):
    // const wb = xlsx.utils.book_new();
    // const ws = xlsx.utils.json_to_sheet(scrapedData);
    // xlsx.utils.book_append_sheet(wb, ws);
    // xlsx.writeFile(wb, "flowershop.xlsx");
    console.log(scrapedData);
    debug.log("Scrape complete!");
    return scrapedData;
  } finally {
    await browser.close();
  }
}
// Clear the terminal, then start the scrape with the default search query.
console.clear();
// main() returns a promise; handle its rejection here so a failed scrape is
// reported and reflected in the exit code instead of being left unhandled.
main().catch((error) => {
  console.error("Scrape failed:", error);
  process.exitCode = 1;
});
Output:
We are not at the end of the pages
Waiting for the page to load in
Starting to scroll now
scrolled by 1733
scrolled by 2478
scrolled by 3201
scrolled by 3201
finished scrolling
[
'https://www.google.com/maps/place/Flowerama+Des+Moines/data=!4m6!3m5!1s0x87ee980de8926543:0xf2b5d3bed00298a!8m2!3d41.5540183!4d-93.5972285!16s%2Fg%2F1tpll8wj?authuser=0&hl=en&rclk=1',
"https://www.google.com/maps/place/Irene's+Flowers/data=!4m6!3m5!1s0x87ee9939947d3f37:0x240aba7767b59599!8m2!3d41.5995484!4d-93.6507188!16s%2Fg%2F1tvm39xl?authuser=0&hl=en&rclk=1",
'https://www.google.com/maps/place/Boesen+The+Florist/data=!4m6!3m5!1s0x87ee9c1c1c4cb587:0x1e0c4959fbf34f6a!8m2!3d41.6267652!4d-93.6767491!16s%2Fg%2F1td2vg3b?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Boesen+the+Florist/data=!4m6!3m5!1s0x87ee9ed6bee42e7d:0xcb33d46e89c3605a!8m2!3d41.5866929!4d-93.668475!16s%2Fg%2F1v0llbbj?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Nielsen+Flower+Shop+Inc./data=!4m6!3m5!1s0x87ee9e23839eda5d:0x25a5ca69824457d2!8m2!3d41.5960756!4d-93.7379209!16s%2Fg%2F1tf36j12?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87ee9bfd29b3601d:0xf8a93939390d2233!8m2!3d41.625509!4d-93.652655!16s%2Fg%2F1tmqgrwk?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Wildflower/data=!4m6!3m5!1s0x87ee99ad677ff647:0x1d8781a36a2887a7!8m2!3d41.5854855!4d-93.6541196!16s%2Fg%2F11g1lnqnd2?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/The+Wild+Orchid/data=!4m6!3m5!1s0x87ee9945df550f53:0x370c6279e304dcba!8m2!3d41.5860602!4d-93.6505064!16s%2Fg%2F11h0ykpc9b?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87eea29553b5443f:0xf6786dd336c55bc8!8m2!3d41.5252573!4d-93.6026348!16s%2Fg%2F1tk701jx?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Flowers+By+Anthony/data=!4m6!3m5!1s0x87ee9888be543f7f:0x2910a8c95a70426b!8m2!3d41.5544784!4d-93.6265733!16s%2Fg%2F1tg79lnh?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Vintage+Barn+Floral/data=!4m6!3m5!1s0x87eea267b43c01b5:0xc33f2bd2d21b1ea8!8m2!3d41.5262999!4d-93.6304368!16s%2Fg%2F11g9jj31y4?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87eea20a5c2518ed:0x38621b6cb4ed0a81!8m2!3d41.5424474!4d-93.6430022!16s%2Fg%2F1yfjk2jr0?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/A+%26+T+Floral+Shop/data=!4m6!3m5!1s0x87eea2639a77abf5:0x456db35484bca0f5!8m2!3d41.5310947!4d-93.6260063!16s%2Fg%2F11gr3ftmp9?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Something+Chic+Floral/data=!4m6!3m5!1s0x87ee9e202ee42883:0x40ad7dac8108d5fc!8m2!3d41.5726407!4d-93.7337251!16s%2Fg%2F1trxkv5w?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87ee90b2f1294c4d:0x64e20c5d311e4337!8m2!3d41.628712!4d-93.5698329!16s%2Fg%2F1tnjk3n4?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Flowerama+West+Des+Moines/data=!4m6!3m5!1s0x87ee9e41fb6edcb9:0x43a12dec9ec234b5!8m2!3d41.6006711!4d-93.7184221!16s%2Fg%2F1tltmy6h?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Adina+Blooms/data=!4m6!3m5!1s0x87ee99a99c40287f:0xa35456bc3228a415!8m2!3d41.626262!4d-93.6318535!16s%2Fg%2F11fr3vyl4d?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87ee9fcf637d8843:0x86b1d654dc505e76!8m2!3d41.5709196!4d-93.7294863!16s%2Fg%2F12lkj773m?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87ee980dfef845dd:0x37245f8a02a1948e!8m2!3d41.554758!4d-93.59495!16s%2Fg%2F1hm444xff?authuser=0&hl=en&rclk=1',
"https://www.google.com/maps/place/Sam's+Club+Floral/data=!4m6!3m5!1s0x87ee9ff3278fa543:0x3b9deb8224abde95!8m2!3d41.5983872!4d-93.7140387!16s%2Fg%2F11j0w3m1qw?authuser=0&hl=en&rclk=1"
]
I got 20 results
... other data
{
shop: 'Hy-Vee Grocery Store',
address: '2540 E Euclid Ave, Des Moines, IA 50317, United States',
website: 'https://www.hy-vee.com/stores/detail.aspx?s=48&utm_source=google&utm_medium=organic&utm_campaign=gmb-listing'
} ... other results
Alternatively, you can do it using Google Maps API from SerpApi. It's a paid API with a free plan.
The difference is that you don't have to figure out which selectors to use and then maintain the parser over time. All the information is immediately available to you, you just need to process the JSON file. Check out the playground.
Usage:
// SerpApi client setup: the API key is read from the API_KEY environment variable.
const SerpApi = require("google-search-results-nodejs");
const mySecret = process.env["API_KEY"]; // your API KEY from serpapi.com
const search = new SerpApi.GoogleSearch(mySecret);
// Request parameters shared by every call. `start` is mutated by getData (it adds 20)
// before each follow-up request.
const params = {
engine: "google_maps", // search engine
q: "flower shop des moines Iowa", // search query
google_domain: "google.com", // google domain of the search
ll: "@41.6238809, -93.9120425,10z", // GPS Coordinates parameter
type: "search", // type of search parameter
hl: "en", // language of the search
start: 0, //result offset parameter
};
// Collects one { shop, address, website } record per local result across all pages.
const scrapedData = [];

/**
 * Handles one response: stores a record for every entry in
 * `data.local_results`, then either requests the next page (when
 * `data.serpapi_pagination.next` is set) or prints everything collected.
 *
 * @param {object} data - Parsed response handed over by search.json.
 */
const getData = function (data) {
  for (const entry of data.local_results ?? []) {
    // Defaults apply only to fields that are undefined on the entry.
    const {
      title: shop = "No shop name provided",
      address = "Delivery service (No address)",
      website = "No website provided",
    } = entry;
    scrapedData.push({ shop, address, website });
  }

  const hasNextPage = Boolean(data.serpapi_pagination?.next);
  if (!hasNextPage) {
    console.log(scrapedData);
    return;
  }
  // Advance the offset by 20 and fetch the following page.
  params.start += 20;
  searchInfo();
};
// Sends the current params to SerpApi; getData receives the response and calls
// searchInfo again when another page is available.
function searchInfo() {
  search.json(params, getData);
}

// Kick off the first request.
searchInfo();
Output:
[
{
shop: 'Flowerama Des Moines',
address: '3310 SE 14th St, Des Moines, IA 50320',
website: 'http://www.1800flowersdesmoines.flowerama.com/'
},
{
shop: "Irene's Flowers",
address: '1151 25th St, Des Moines, IA 50311',
website: 'http://www.dsmflorist.com/'
},
{
shop: 'Boesen The Florist',
address: '3422 Beaver Ave, Des Moines, IA 50310',
website: 'http://www.boesen.com/'
},
...
]
Disclaimer: I work for SerpApi.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 |