Fixed infinite scroll

This commit is contained in:
drobnikj 2019-01-08 13:01:31 +01:00
parent 84b4040a0b
commit 3b87790d07

View File

@ -35,10 +35,10 @@ const getPageScrollInfo = (page, elementToScroll) => page.evaluate((elementToScr
module.exports = async (page, maxHeight, elementToScroll = 'body') => { module.exports = async (page, maxHeight, elementToScroll = 'body') => {
const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other']; const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other'];
const stringifyScrollInfo = (scrollInfo) => { const stringifyScrollInfo = (scrollInfo) => {
return `scrollTop=${scrollInfo.scrollTop}, ` + return `scrollTop=${scrollInfo.scrollTop}, `
`clientHeight=${scrollInfo.clientHeight}, ` + + `clientHeight=${scrollInfo.clientHeight}, `
`scrollHeight=${scrollInfo.scrollHeight}, ` + + `scrollHeight=${scrollInfo.scrollHeight}, `
`maxHeight=${maxHeight}`; + `maxHeight=${maxHeight}`;
}; };
const defaultScrollDelay = 3000; const defaultScrollDelay = 3000;
const defaultElementTimeout = 60000; const defaultElementTimeout = 60000;
@ -72,57 +72,52 @@ module.exports = async (page, maxHeight, elementToScroll = 'body') => {
} }
}); });
try { await page.waitForSelector(elementToScroll, { timeout: defaultElementTimeout });
await page.waitForSelector(elementToScroll, { timeout: defaultElementTimeout }); let scrollInfo = await getPageScrollInfo(page, elementToScroll);
let scrollInfo = await getPageScrollInfo(page, elementToScroll); logInfo(`Infinite scroll started (${stringifyScrollInfo(scrollInfo)}).`);
logInfo(`Infinite scroll started (${stringifyScrollInfo(scrollInfo)}).`);
let previosReviewsCount = 0; let previosReviewsCount = 0;
while (true) { while (true) {
scrollInfo = await getPageScrollInfo(page, elementToScroll); scrollInfo = await getPageScrollInfo(page, elementToScroll);
// Forget pending resources that didn't finish loading in time // Forget pending resources that didn't finish loading in time
const now = Date.now(); const now = Date.now();
const timeout = 30000; // TODO: use resourceTimeout const timeout = 30000; // TODO: use resourceTimeout
Object.keys(pendingRequests) Object.keys(pendingRequests)
.forEach((requestId) => { .forEach((requestId) => {
if (pendingRequests[requestId] + timeout < now) { if (pendingRequests[requestId] + timeout < now) {
delete pendingRequests[requestId]; delete pendingRequests[requestId];
resourcesStats.forgotten++; resourcesStats.forgotten++;
} }
}); });
logDebug(`Infinite scroll stats (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)}).`); logDebug(`Infinite scroll stats (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)}).`);
const pendingRequestsCount = resourcesStats.requested - (resourcesStats.finished + resourcesStats.failed + resourcesStats.forgotten); const pendingRequestsCount = resourcesStats.requested - (resourcesStats.finished + resourcesStats.failed + resourcesStats.forgotten);
// We have to wait if all xhrs are finished // We have to wait if all xhrs are finished
if (pendingRequestsCount === 0) { if (pendingRequestsCount === 0) {
const isLoaderOnPage = await page.evaluate(() => { const isLoaderOnPage = await page.evaluate(() => {
const loader = $('.section-loading-spinner'); const loader = $('.section-loading-spinner');
if (loader) return loader.parent().attr('style') !== 'display: none;'; if (loader) return loader.parent().attr('style') !== 'display: none;';
}); });
const reviewsCount = await page.evaluate(() => $('div.section-review').length); const reviewsCount = await page.evaluate(() => $('div.section-review').length);
/** /**
* If the page is scrolled to the very bottom or beyond * If the page is scrolled to the very bottom or beyond
* maximum height and loader is not displayed and we don't find new reviews, we are done. * maximum height and loader is not displayed and we don't find new reviews, we are done.
*/ */
if (reviewsCount === previosReviewsCount if (reviewsCount === previosReviewsCount
&& (scrollInfo.scrollTop + scrollInfo.clientHeight >= Math.min(scrollInfo.scrollHeight, maxHeight)) && (scrollInfo.scrollTop + scrollInfo.clientHeight >= Math.min(scrollInfo.scrollHeight, maxHeight))
&& !isLoaderOnPage && !isLoaderOnPage
) break; ) break;
previosReviewsCount = reviewsCount; previosReviewsCount = reviewsCount;
// Otherwise we try to scroll down // Otherwise we try to scroll down
await scrollTo(page, elementToScroll, maxHeight); await scrollTo(page, elementToScroll, maxHeight);
}
await sleep(defaultScrollDelay);
} }
page.removeAllListeners('request'); await sleep(defaultScrollDelay);
logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
} catch (err) {
// Infinite scroll should not break whole crawler
logError('An exception thrown in infiniteScroll()', err);
} }
page.removeAllListeners('request');
logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
}; };