// Mirror of https://github.com/davidjohnbarton/crawler-google-places.git
// Synced 2025-12-12 08:28:46 +00:00 - 119 lines, 4.7 KiB, JavaScript
const Apify = require('apify');
|
|
|
|
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed.
 * @param {number} ms - delay handed to setTimeout
 * @return {Promise<void>}
 */
const sleepPromised = (ms) => new Promise((wake) => {
    setTimeout(wake, ms);
});
|
|
|
|
/**
 * Logs an error: first the message prefixed with `ERROR: ` via console.log,
 * then the error object itself via console.error.
 * @param {string} msg - human-readable description of what failed
 * @param {*} e - the caught error (or any value) to print
 */
const logError = (msg, e) => {
    console.log(`ERROR: ${msg}`);
    console.error(e);
};
|
|
/**
 * Prints an informational message via console.log, prefixed with `INFO: `.
 * @param {string} msg - text to print
 */
const logInfo = (msg) => {
    const line = `INFO: ${msg}`;
    return console.log(line);
};
|
|
/**
 * Prints a debug message via console.log, prefixed with `DEBUG: `.
 * @param {string} msg - text to print
 */
const logDebug = (msg) => {
    const line = `DEBUG: ${msg}`;
    return console.log(line);
};
|
|
|
|
/**
 * Sets the vertical scroll position of one element inside the page.
 *
 * The callback runs via page.evaluate, so `document` is the page's document. If the
 * selector matches nothing, assigning to `scrollTop` throws there and the returned
 * promise rejects.
 *
 * @param page - object exposing `evaluate(fn, ...args)` (the crawled page)
 * @param elementToScroll - CSS selector of the element to scroll
 * @param scrollToHeight - value assigned to the element's `scrollTop`
 * @return the promise returned by `page.evaluate`
 */
const scrollTo = (page, elementToScroll, scrollToHeight) => page.evaluate((selector, top) => {
    const scrollable = document.querySelector(selector);
    scrollable.scrollTop = top;
}, elementToScroll, scrollToHeight);
|
|
|
|
/**
 * Reads the current scroll metrics of one element inside the page.
 *
 * The callback runs via page.evaluate and looks the element up once. If the selector
 * matches nothing, reading its properties throws there and the returned promise rejects.
 *
 * @param page - object exposing `evaluate(fn, ...args)` (the crawled page)
 * @param elementToScroll - CSS selector of the element to inspect
 * @return promise of `{ scrollHeight, scrollTop, clientHeight }` copied from the element
 */
const getPageScrollInfo = (page, elementToScroll) => page.evaluate((selector) => {
    const { scrollHeight, scrollTop, clientHeight } = document.querySelector(selector);
    return { scrollHeight, scrollTop, clientHeight };
}, elementToScroll);
|
|
|
|
/**
|
|
* Scroll to down page until infinite scroll ends or reaches maxHeight
|
|
* @param page - instance of crawled page
|
|
* @param maxHeight - max height of document to scrollscrollHeight
|
|
* @param elementToScroll - CSS selector of element where we want to scroll, default is 'body'
|
|
* @return {Promise.<void>}
|
|
*/
|
|
module.exports = async (page, maxHeight, elementToScroll = 'body') => {
|
|
const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other'];
|
|
const stringifyScrollInfo = (scrollInfo) => {
|
|
return `scrollTop=${scrollInfo.scrollTop}, ` +
|
|
`clientHeight=${scrollInfo.clientHeight}, ` +
|
|
`scrollHeight=${scrollInfo.scrollHeight}, ` +
|
|
`maxHeight=${maxHeight}`;
|
|
};
|
|
const defaultScrollDelay = 2000;
|
|
|
|
// Catch and count all pages request for resources
|
|
const resourcesStats = {
|
|
requested: 0,
|
|
finished: 0,
|
|
failed: 0,
|
|
forgotten: 0,
|
|
};
|
|
const pendingRequests = {};
|
|
page.on('request', (msg) => {
|
|
if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
|
|
pendingRequests[msg._requestId] = Date.now();
|
|
resourcesStats.requested++;
|
|
}
|
|
});
|
|
page.on('requestfailed', (msg) => {
|
|
if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
|
|
if (pendingRequests[msg._requestId]) {
|
|
delete pendingRequests[msg._requestId];
|
|
resourcesStats.failed++;
|
|
}
|
|
}
|
|
});
|
|
page.on('requestfinished', (msg) => {
|
|
if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
|
|
if (pendingRequests[msg._requestId]) {
|
|
delete pendingRequests[msg._requestId];
|
|
resourcesStats.finished++;
|
|
}
|
|
}
|
|
});
|
|
|
|
try {
|
|
await page.waitForSelector(elementToScroll);
|
|
let scrollInfo = await getPageScrollInfo(page, elementToScroll);
|
|
logInfo(`Infinite scroll started (${stringifyScrollInfo(scrollInfo)}).`);
|
|
|
|
while (true) {
|
|
scrollInfo = await getPageScrollInfo(page, elementToScroll);
|
|
|
|
// Forget pending resources that didn't finish loading in time
|
|
const now = Date.now();
|
|
const timeout = 30000; // TODO: use resourceTimeout
|
|
Object.keys(pendingRequests)
|
|
.forEach((requestId) => {
|
|
if (pendingRequests[requestId] + timeout < now) {
|
|
delete pendingRequests[requestId];
|
|
resourcesStats.forgotten++;
|
|
}
|
|
});
|
|
|
|
logDebug(`Infinite scroll stats (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)}).`);
|
|
|
|
const pendingRequestsCount = resourcesStats.requested - (resourcesStats.finished + resourcesStats.failed + resourcesStats.forgotten);
|
|
if (pendingRequestsCount === 0) {
|
|
// If the page is scrolled to the very bottom or beyond maximum height, we are done
|
|
if (scrollInfo.scrollTop + scrollInfo.clientHeight >= Math.min(scrollInfo.scrollHeight, maxHeight)) break;
|
|
// Otherwise we try to scroll down
|
|
await scrollTo(page, elementToScroll, scrollInfo.scrollHeight);
|
|
}
|
|
|
|
await sleepPromised(defaultScrollDelay);
|
|
}
|
|
// Scroll back up, otherwise the screenshot of the browser would only show the bottom of
|
|
// the page
|
|
await scrollTo(page, elementToScroll, scrollInfo.scrollHeight);
|
|
|
|
logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
|
|
} catch (err) {
|
|
logError('An exception thrown in infiniteScroll()', err);
|
|
}
|
|
};
|