// Mirror of https://github.com/davidjohnbarton/crawler-google-places.git
// Synced 2025-12-12 16:38:45 +00:00 (106 lines, 4.7 KiB, JavaScript)
const Apify = require('apify');
|
|
|
|
const { sleep, log } = Apify.utils;
|
|
const { DEFAULT_TIMEOUT, LISTING_PAGINATION_KEY } = require('./consts');
|
|
|
|
/**
 * Waits until the Google Maps search box is no longer marked as loading.
 *
 * The predicate is handed to `page.waitFor`, so it runs inside the page. A missing
 * `#searchbox` element is treated as "still loading": the predicate returns `false`
 * instead of throwing a TypeError on `null.classList`.
 *
 * @param page Puppeteer page to wait on
 * @returns whatever `page.waitFor` returns (callers in this file `await` it)
 */
const waitForGoogleMapLoader = (page) => page.waitFor(() => {
    const searchBox = document.querySelector('#searchbox');
    return Boolean(searchBox) && !searchBox.classList.contains('loading');
}, { timeout: DEFAULT_TIMEOUT });
|
|
|
|
/**
 * Opens every place on the currently displayed listing page and enqueues its URL.
 *
 * Each `.section-result` item is clicked in turn, the URL of the page that opens is
 * added to the request queue with the label `detail`, and the "back to list" button
 * is clicked before the next item is processed.
 *
 * @param page Puppeteer page showing a list of `.section-result` items
 * @param requestQueue queue that receives `{ url, userData: { label: 'detail' } }`
 * @param paginationFrom offset of this listing page (the caller in this file passes
 *   the first number parsed from the pagination label)
 * @param maxPlacesPerCrawl maximum number of places to enqueue; falsy means no limit
 * @returns {Promise<boolean>} `true` when the limit was reached (the page is then left
 *   on the place detail), `false` when the whole listing page was processed
 * @throws {Error} when a listing result or its `h3` title cannot be found
 */
const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom, maxPlacesPerCrawl) => {
    const resultsCount = (await page.$$('.section-result')).length;

    for (let resultIndex = 0; resultIndex < resultsCount; resultIndex++) {
        // Element handles are lost after the list -> detail -> list round trip,
        // so wait for the list to render again and query it afresh every time.
        await page.waitForSelector('.searchbox', { timeout: DEFAULT_TIMEOUT });
        await waitForGoogleMapLoader(page);
        await page.waitFor((index) => {
            return document.querySelectorAll('.section-result h3').length >= index + 1;
        }, { timeout: DEFAULT_TIMEOUT }, resultIndex);

        const results = await page.$$('.section-result');
        const result = results[resultIndex];
        const link = result ? await result.$('h3') : null;
        if (!link) {
            throw new Error(`Listing result ${resultIndex + 1} has no "h3" title to click`);
        }

        await link.click();
        await waitForGoogleMapLoader(page);
        await page.waitForSelector('.section-back-to-list-button', { timeout: DEFAULT_TIMEOUT });

        // After redirection to the detail page, save its URL to the request queue to process it later.
        const url = page.url();
        await requestQueue.addRequest({ url, userData: { label: 'detail' } });
        log.info(`Added to queue ${url}`);

        if (maxPlacesPerCrawl && paginationFrom + resultIndex + 1 > maxPlacesPerCrawl) {
            log.info(`Reach max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`);
            return true;
        }

        await page.click('.section-back-to-list-button');
    }

    return false;
};
|
|
|
|
/**
 * Searches Google Maps for `searchString` and adds the places it finds to the queue.
 *
 * If the search lands directly on a single place (an `h1.section-hero-header-title`
 * is present), only that page's URL is enqueued. Otherwise the listing is walked page
 * by page: each page is handed to `enqueueAllUrlsFromPagination`, and the pagination
 * progress is stored under `LISTING_PAGINATION_KEY` after every page so a later run
 * can skip pages that were already processed.
 *
 * @param page Puppeteer page with the Google Maps search box available
 * @param searchString text typed into the search box
 * @param requestQueue queue that receives the place detail requests
 * @param maxPlacesPerCrawl maximum number of places to enqueue; falsy means no limit
 * @returns {Promise<void>}
 * @throws {Error} when the pagination label contains no numbers
 */
const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlacesPerCrawl) => {
    // Saved state of listing pagination.
    // NOTE: if a previous attempt failed, already scraped pagination is skipped below.
    const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};

    await page.type('#searchboxinput', searchString);
    await sleep(5000);
    await page.click('#searchbox-searchbutton');
    await sleep(5000);
    await waitForGoogleMapLoader(page);
    try {
        await page.waitForSelector('h1.section-hero-header-title');
    } catch (e) {
        // Expected when the search produced a list of places instead of a single detail.
        log.debug(`No single place detail after search, expecting a listing: ${e.message}`);
    }

    // In case there is no list of details, enqueue just the detail page.
    const maybeDetailPlace = await page.$('h1.section-hero-header-title');
    if (maybeDetailPlace) {
        const url = page.url();
        await requestQueue.addRequest({ url, userData: { label: 'detail' } });
        return;
    }

    // In case there is a list of details, go through it, limited by maxPlacesPerCrawl.
    const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]';
    let isFinished;
    while (true) {
        await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });

        const paginationText = await page.$eval('.n7lv7yjyC35__right', (el) => el.innerText);
        const paginationNumbers = paginationText.match(/\d+/g);
        if (!paginationNumbers) {
            throw new Error(`Cannot read listing pagination from text "${paginationText}"`);
        }
        const [fromText, toText] = paginationNumbers;
        const from = parseInt(fromText, 10);
        const to = parseInt(toText, 10);

        if (listingPagination.from && from <= listingPagination.from) {
            log.debug(`Skiped pagination ${from} - ${to}, already done!`);
        } else {
            log.debug(`Added links from pagination ${from} - ${to}`);
            isFinished = await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
            listingPagination.from = from;
            listingPagination.to = to;
            await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
        }

        if (!isFinished) await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });

        // `$` is resolved inside the page, not in Node.
        // NOTE(review): presumably jQuery injected before this function is called - confirm.
        const isNextPaginationDisabled = await page.evaluate((sel) => {
            return !!$(sel).attr('disabled');
        }, nextButtonSelector);
        const noResultsEl = await page.$('.section-no-result-title');

        if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl <= to) || isFinished) {
            break;
        }

        // NOTE: puppeteer API click() didn't work here, so the click is done in the page.
        await page.evaluate((sel) => $(sel).click(), nextButtonSelector);
        await waitForGoogleMapLoader(page);
    }

    listingPagination.isFinish = true;
    await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
};
|
|
|
|
// Only the listing entry point is exported; the other helpers in this file stay module-private.
module.exports = { enqueueAllPlaceDetails };
|