diff --git a/src/infinite_scroll.js b/src/infinite_scroll.js
new file mode 100644
index 0000000..3250b84
--- /dev/null
+++ b/src/infinite_scroll.js
@@ -0,0 +1,119 @@
+const Apify = require('apify');
+
+const sleepPromised = ms => new Promise(resolve => setTimeout(resolve, ms));
+
+const logError = (msg, e) => {
+    console.log(`ERROR: ${msg}`);
+    console.error(e);
+};
+const logInfo = (msg) => console.log(`INFO: ${msg}`);
+const logDebug = (msg) => console.log(`DEBUG: ${msg}`);
+
+/**
+ * Scrolls the element matching elementToScroll to scrollToHeight.
+ */
+const scrollTo = (page, elementToScroll, scrollToHeight) => page.evaluate((elementToScroll, scrollToHeight) => {
+    const scrollable = document.querySelector(elementToScroll);
+    scrollable.scrollTop = scrollToHeight;
+}, elementToScroll, scrollToHeight);
+
+/**
+ * Method returns info about page scroll
+ */
+const getPageScrollInfo = (page, elementToScroll) => page.evaluate((elementToScroll) => {
+    return {
+        scrollHeight: document.querySelector(elementToScroll).scrollHeight,
+        scrollTop: document.querySelector(elementToScroll).scrollTop,
+        clientHeight: document.querySelector(elementToScroll).clientHeight,
+    };
+}, elementToScroll);
+
+/**
+ * Scrolls down the page until the infinite scroll ends or maxHeight is reached
+ * @param page - instance of crawled page
+ * @param maxHeight - max height of document to scroll
+ * @param elementToScroll - CSS selector of element where we want to scroll, default is 'body'
+ * @return {Promise.<void>}
+ */
+module.exports = async (page, maxHeight, elementToScroll = 'body') => {
+    const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other'];
+    const stringifyScrollInfo = (scrollInfo) => {
+        return `scrollTop=${scrollInfo.scrollTop}, ` +
+            `clientHeight=${scrollInfo.clientHeight}, ` +
+            `scrollHeight=${scrollInfo.scrollHeight}, ` +
+            `maxHeight=${maxHeight}`;
+    };
+    const defaultScrollDelay = 2000;
+
+    // Catch and count all page requests for resources
+    const resourcesStats = {
+        requested: 0,
+        finished: 0,
+        failed: 0,
+        forgotten: 0,
+    };
+    const pendingRequests = {};
+    page.on('request', (msg) => {
+        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
+            pendingRequests[msg._requestId] = Date.now();
+            resourcesStats.requested++;
+        }
+    });
+    page.on('requestfailed', (msg) => {
+        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
+            if (pendingRequests[msg._requestId]) {
+                delete pendingRequests[msg._requestId];
+                resourcesStats.failed++;
+            }
+
+        }
+    });
+    page.on('requestfinished', (msg) => {
+        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
+            if (pendingRequests[msg._requestId]) {
+                delete pendingRequests[msg._requestId];
+                resourcesStats.finished++;
+            }
+        }
+    });
+
+    try {
+        await page.waitForSelector(elementToScroll);
+        let scrollInfo = await getPageScrollInfo(page, elementToScroll);
+        logInfo(`Infinite scroll started (${stringifyScrollInfo(scrollInfo)}).`);
+
+        while (true) {
+            scrollInfo = await getPageScrollInfo(page, elementToScroll);
+
+            // Forget pending resources that didn't finish loading in time
+            const now = Date.now();
+            const timeout = 30000; // TODO: use resourceTimeout
+            Object.keys(pendingRequests)
+                .forEach((requestId) => {
+                    if (pendingRequests[requestId] + timeout < now) {
+                        delete pendingRequests[requestId];
+                        resourcesStats.forgotten++;
+                    }
+                });
+
+            logDebug(`Infinite scroll stats (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)}).`);
+
+            const pendingRequestsCount = resourcesStats.requested - (resourcesStats.finished + resourcesStats.failed + resourcesStats.forgotten);
+            if (pendingRequestsCount === 0) {
+                // If the page is scrolled to the very bottom or beyond maximum height, we are done
+                if (scrollInfo.scrollTop + scrollInfo.clientHeight >= Math.min(scrollInfo.scrollHeight, maxHeight)) break;
+                // Otherwise we try to scroll down
+                await scrollTo(page, elementToScroll, scrollInfo.scrollHeight);
+            }
+
+            await sleepPromised(defaultScrollDelay);
+        }
+        // Scroll back up, otherwise the screenshot of the browser would only show the bottom of
+        // the page
+        await scrollTo(page, elementToScroll, 0);
+
+        logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
+    } catch (err) {
+        logError('An exception thrown in infiniteScroll()', err);
+    }
+};
diff --git a/src/main.js b/src/main.js
index 9b9562a..a2be6fe 100644
--- a/src/main.js
+++ b/src/main.js
@@ -2,124 +2,13 @@
  * Run the following example to perform a recursive crawl of a website using Puppeteer.
  */
 const Apify = require('apify');
+const infiniteScroll = require('./infinite_scroll');
 
 const { sleep } = Apify.utils;
 const { injectJQuery } = Apify.utils.puppeteer;
 
 const DEFAULT_TIMEOUT = 60 * 1000; // 60 sec
-
-const sleepPromised = ms => new Promise(resolve => setTimeout(resolve, ms));
-
-const logError = (msg, e) => {
-    console.log(`ERROR: ${msg}`);
-    console.error(e);
-};
-const logInfo = (msg) => console.log(`INFO: ${msg}`);
-const logDebug = (msg) => console.log(`DEBUG: ${msg}`);
-
-/**
- * Method scrolls page to xpos, ypos.
- */
-const scrollTo = (page, xpos, ypos) => page.evaluate((x, y) => window.scrollTo(x, y), xpos, ypos);
-
-/**
- * Method returns info about page scroll
- */
-const getPageScrollInfo = page => page.evaluate(() => {
-    return {
-        scrollHeight: document.documentElement.scrollHeight,
-        scrollTop: document.documentElement.scrollTop,
-        clientHeight: document.documentElement.clientHeight,
-    };
-});
-
-/**
- * Scroll to down page until infinite scroll ends or reaches maxHeight
- * @param page - instance of crawled page
- * @param maxHeight - max height of document to scroll
- * @return {Promise.<void>}
- */
-const infiniteScroll = async (page, maxHeight) => {
-    const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other'];
-    const stringifyScrollInfo = (scrollInfo) => {
-        return `scrollTop=${scrollInfo.scrollTop}, ` +
-            `clientHeight=${scrollInfo.clientHeight}, ` +
-            `scrollHeight=${scrollInfo.scrollHeight}, ` +
-            `maxHeight=${maxHeight}`;
-    };
-    const defaultScrollDelay = 500;
-
-    // Catch and count all pages request for resources
-    const resourcesStats = {
-        requested: 0,
-        finished: 0,
-        failed: 0,
-        forgotten: 0,
-    };
-    const pendingRequests = {};
-    page.on('request', (msg) => {
-        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
-            pendingRequests[msg._requestId] = Date.now();
-            resourcesStats.requested++;
-        }
-    });
-    page.on('requestfailed', (msg) => {
-        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
-            if (pendingRequests[msg._requestId]) {
-                delete pendingRequests[msg._requestId];
-                resourcesStats.failed++;
-            }
-
-        }
-    });
-    page.on('requestfinished', (msg) => {
-        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
-            if (pendingRequests[msg._requestId]) {
-                delete pendingRequests[msg._requestId];
-                resourcesStats.finished++;
-            }
-        }
-    });
-
-    try {
-        let scrollInfo = await getPageScrollInfo(page);
-        logInfo(`Infinite scroll started (${stringifyScrollInfo(scrollInfo)}).`);
-
-        while (true) {
-            scrollInfo = await getPageScrollInfo(page);
-
-            // Forget pending resources that didn't finish loading in time
-            const now = Date.now();
-            const timeout = 30000; // TODO: use resourceTimeout
-            Object.keys(pendingRequests)
-                .forEach((requestId) => {
-                    if (pendingRequests[requestId] + timeout < now) {
-                        delete pendingRequests[requestId];
-                        resourcesStats.forgotten++;
-                    }
-                });
-
-            logDebug(`Infinite scroll stats (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)}).`);
-
-            const pendingRequestsCount = resourcesStats.requested - (resourcesStats.finished + resourcesStats.failed + resourcesStats.forgotten);
-            if (pendingRequestsCount === 0) {
-                // If the page is scrolled to the very bottom or beyond maximum height, we are done
-                if (scrollInfo.scrollTop + scrollInfo.clientHeight >= Math.min(scrollInfo.scrollHeight, maxHeight)) break;
-                // Otherwise we try to scroll down
-                await scrollTo(page, 0, scrollInfo.scrollHeight);
-            }
-
-            await sleepPromised(defaultScrollDelay);
-        }
-        // Scroll back up, otherwise the screenshot of the browser would only show the bottom of
-        // the page
-        await scrollTo(page, 0, 0);
-
-        logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
-    } catch (err) {
-        logError('An exception thrown in infiniteScroll()', err);
-    }
-};
+const LISTING_PAGINATION_KEY = 'listingState';
 
 const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
     const detailLinks = [];
@@ -148,7 +37,7 @@ Apify.main(async () => {
     let startUrl;
 
     if (searchViewport) {
-        const { lat, lng, zoom = 10 } = searchViewport
+        const { lat, lng, zoom = 10 } = searchViewport;
         if (!lat || !lng) throw new Error('You have to defined lat and lng for searchViewport!');
         startUrl = `https://www.google.com/maps/@${lat},${lng},${zoom}z/search`;
     } else {
@@ -160,12 +49,17 @@ Apify.main(async () => {
     const requestQueue = await Apify.openRequestQueue();
     await requestQueue.addRequest({ url: startUrl, userData: { label: 'startUrl' } });
 
+    // Store state of listing pagination
+    // NOTE: Ensures that if pageFunction fails, already scraped pagination pages are skipped
+    let listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};
+
     const crawler = new Apify.PuppeteerCrawler({
+        maxOpenPagesPerInstance: 1, // NOTE: Ensures that we rotate the IP after each request
         launchPuppeteerOptions: {
            useApifyProxy: true,
            useChrome: true,
            apifyProxyGroups: ['CZECH_LUMINATI'],
-           liveView: Apify.isAtHome(),
+           // liveView: Apify.isAtHome(),
        },
        requestQueue,
        handlePageTimeoutSecs: 1800, // We are adding all links to queue on startUrl
@@ -180,10 +74,19 @@ Apify.main(async () => {
                 await page.click('#searchbox-searchbutton');
                 await sleep(5000);
                 while(true) {
-                    const paginationText = await page.$eval('.section-pagination-right', el => el.innerText);
-                    console.log(`Added links from pagination: ${paginationText}`);
                     await page.waitForSelector('#section-pagination-button-next', { timeout: DEFAULT_TIMEOUT });
-                    await enqueueAllUrlsFromPagination(page, requestQueue);
+                    const paginationText = await page.$eval('.section-pagination-right', el => el.innerText);
+                    const [fromString, toString] = paginationText.match(/\d+/g);
+                    const from = parseInt(fromString);
+                    const to = parseInt(toString);
+                    if (listingPagination.to && to <= listingPagination.to) {
+                        console.log(`Skipped pagination ${from} - ${to}, already done!`);
+                    } else {
+                        console.log(`Added links from pagination ${from} - ${to}`);
+                        await enqueueAllUrlsFromPagination(page, requestQueue);
+                    }
+                    listingPagination = { from, to };
+                    await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
                     const nextButton = await page.$('#section-pagination-button-next');
                     const isNextPaginationDisabled = (await nextButton.getProperty('disabled') === 'true');
                     if (isNextPaginationDisabled) {
@@ -201,7 +104,6 @@ Apify.main(async () => {
                 return {
                     title: $('h1.section-hero-header-title').text().trim(),
                     totalScore: $('span.section-star-display').eq(0).text().trim(),
-                    reviewsCount: $('button.section-reviewchart-numreviews').text().trim().match(/\d+/)[0],
                     categoryName: $('[jsaction="pane.rating.category"]').text().trim(),
                     address: $('[data-section-id="ad"] .widget-pane-link').text().trim(),
                     plusCode: $('[data-section-id="ol"] .widget-pane-link').text().trim(),
@@ -209,32 +111,39 @@ Apify.main(async () => {
                 });
                 placeDetail.url = request.url;
                 placeDetail.reviews = [];
-                console.log(placeDetail);
-
-                // Get all reviews
-                await page.click('button.section-reviewchart-numreviews');
-                await infiniteScroll(page, 99999999999);
-                const reviewEls = await page.$$('div.section-review');
-                for (const reviewEl of reviewEls) {
-                    const moreButton = await reviewEl.$('.section-expand-review');
-                    if (moreButton) {
-                        await moreButton.click();
-                        sleep(1000);
+                if (placeDetail.totalScore) {
+                    placeDetail.reviewsCount = await page.evaluate(() => {
+                        const numberReviewsText = $('button.section-reviewchart-numreviews').text().trim();
+                        return (numberReviewsText) ? numberReviewsText.match(/\d+/)[0] : null;
+                    });
+
+                    // Get all reviews
+                    await page.click('button.section-reviewchart-numreviews');
+                    await page.waitForSelector('.section-star-display');
+                    await infiniteScroll(page, 99999999999, '.section-scrollbox');
+                    await sleep(2000);
+                    const reviewEls = await page.$$('div.section-review');
+                    for (const reviewEl of reviewEls) {
+                        const moreButton = await reviewEl.$('.section-expand-review');
+                        if (moreButton) {
+                            await moreButton.click();
+                            await sleep(1000);
+                        }
+                        const review = await page.evaluate((reviewEl) => {
+                            const $review = $(reviewEl);
+                            return {
+                                name: $review.find('.section-review-title').text().trim(),
+                                text: $review.find('.section-review-text').text(),
+                                stars: $review.find('.section-review-stars').attr('aria-label').trim(),
+                                publishAt: $review.find('.section-review-publish-date').text().trim(),
+                                likesCount: $review.find('.section-review-thumbs-up-count').text().trim(),
+                            };
+                        }, reviewEl);
+                        placeDetail.reviews.push(review);
                     }
-                    const review = await page.evaluate((reviewEl) => {
-                        const $review = $(reviewEl);
-                        return {
-                            name: $review.find('.section-review-title').text().trim(),
-                            text: $review.find('.section-review-text').text(),
-                            stars: $review.find('.section-review-stars').attr('aria-label').trim(),
-                            publishAt: $review.find('.section-review-publish-date').text().trim(),
-                            likesCount: $review.find('.section-review-thumbs-up-count').text().trim(),
-                        };
-                    }, reviewEl);
-                    placeDetail.reviews.push(review);
                 }
                 await Apify.pushData(placeDetail);
             }
-            console.log('Done ', request.url);
+            console.log(request.url, 'Done');
         },
     });