diff --git a/src/consts.js b/src/consts.js
new file mode 100644
index 0000000..08bbdf6
--- /dev/null
+++ b/src/consts.js
@@ -0,0 +1,6 @@
+// NOTE: This is not nice, it waits for implementing default timeout into puppeteer.
+exports.DEFAULT_TIMEOUT = 60 * 1000; // 60 sec
+
+exports.LISTING_PAGINATION_KEY = 'listingState';
+exports.MAX_PAGE_RETRIES = 5;
+
diff --git a/src/enqueue_places_crawler.js b/src/enqueue_places_crawler.js
new file mode 100644
index 0000000..8ca87e1
--- /dev/null
+++ b/src/enqueue_places_crawler.js
@@ -0,0 +1,101 @@
+const Apify = require('apify');
+const { sleep } = Apify.utils;
+const { injectJQuery } = Apify.utils.puppeteer;
+const { MAX_PAGE_RETRIES, DEFAULT_TIMEOUT, LISTING_PAGINATION_KEY } = require('./consts');
+
+const waitForGoogleMapLoader = (page) => page.waitFor(() => !document.querySelector('#searchbox').classList.contains('loading'), { timeout: DEFAULT_TIMEOUT });
+
+const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
+    let results = await page.$$('.section-result');
+    const resultsCount = results.length;
+    for (let resultIndex = 0; resultIndex < resultsCount; resultIndex++) {
+        // Need to get results again, puppeteer loses the element context after navigation.
+        await page.waitForSelector('.searchbox', { timeout: DEFAULT_TIMEOUT });
+        await waitForGoogleMapLoader(page);
+        await page.waitFor((resultIndex) => {
+            return document.querySelectorAll('.section-result h3').length >= resultIndex + 1;
+        }, { timeout: DEFAULT_TIMEOUT }, resultIndex);
+        results = await page.$$('.section-result');
+        const link = await results[resultIndex].$('h3');
+        await link.click();
+        await waitForGoogleMapLoader(page);
+        await page.waitForSelector('.section-back-to-list-button', { timeout: DEFAULT_TIMEOUT });
+        const url = page.url();
+        await requestQueue.addRequest({ url, userData: { label: 'detail' } });
+        console.log(`Added to queue ${url}`);
+        await page.click('.section-back-to-list-button');
+    }
+};
+
+/**
+ * Crawler that adds every place detail from the listing pagination to the request queue.
+ * @param startUrl
+ * @param searchString
+ * @param launchPuppeteerOptions
+ * @param requestQueue
+ * @param listingPagination
+ * @param retries
+ */
+const enqueueAllPlaceDetailsCrawler = async (startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination, retries = 0) => {
+    let browser;
+    try {
+        browser = await Apify.launchPuppeteer(launchPuppeteerOptions);
+        const page = await browser.newPage();
+        await page._client.send('Emulation.clearDeviceMetricsOverride');
+        await page.goto(startUrl);
+        await injectJQuery(page);
+        await page.type('#searchboxinput', searchString);
+        await sleep(5000);
+        await page.click('#searchbox-searchbutton');
+        await sleep(5000);
+        await waitForGoogleMapLoader(page);
+        // In case there is no listing, put just detail page to queue
+        const maybeDetailPlace = await page.$('h1.section-hero-header-title');
+        if (maybeDetailPlace) {
+            const url = page.url();
+            await requestQueue.addRequest({ url, userData: { label: 'detail' } });
+            return;
+        }
+        const nextButtonSelector = '#section-pagination-button-next';
+        while (true) {
+            await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
+            const paginationText = await page.$eval('.section-pagination-right', (el) => el.innerText);
+            const [fromString, toString] = paginationText.match(/\d+/g);
+            const from = parseInt(fromString, 10);
+            const to = parseInt(toString, 10);
+            if (listingPagination.from && from <= listingPagination.from) {
+                console.log(`Skipped pagination ${from} - ${to}, already done!`);
+            } else {
+                console.log(`Added links from pagination ${from} - ${to}`);
+                await enqueueAllUrlsFromPagination(page, requestQueue);
+                listingPagination = { from, to };
+                await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
+            }
+            await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
+            const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => {
+                return !!$(nextButtonSelector).attr('disabled');
+            }, nextButtonSelector);
+            const noResultsEl = await page.$('.section-no-result-title');
+            if (isNextPaginationDisabled || noResultsEl) {
+                break;
+            } else {
+                // NOTE: puppeteer API click() didn't work :(
+                await page.evaluate((sel) => $(sel).click(), nextButtonSelector);
+                await waitForGoogleMapLoader(page);
+            }
+        }
+    } catch (err) {
+        // Restart the whole search; listingPagination state lets us skip pages already enqueued.
+        if (retries >= MAX_PAGE_RETRIES) throw err;
+        retries++;
+        console.log(`Retrying enqueueAllPlaceDetailsCrawler (attempt ${retries}), error:`);
+        console.error(err);
+        if (browser) await browser.close();
+        browser = null; // prevent a second close() in the finally block
+        await enqueueAllPlaceDetailsCrawler(startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination, retries);
+    } finally {
+        if (browser) await browser.close();
+    }
+};
+
+module.exports = { run: enqueueAllPlaceDetailsCrawler };
diff --git a/src/infinite_scroll.js b/src/infinite_scroll.js
index cbd3edd..2eeb850 100644
--- a/src/infinite_scroll.js
+++ b/src/infinite_scroll.js
@@ -28,7 +28,7 @@ const getPageScrollInfo = (page, elementToScroll) => page.evaluate((elementToScr
 /**
  * Scroll to down page until infinite scroll ends or reaches maxHeight
  * @param page - instance of crawled page
- * @param maxHeight - max height of document to scrollscrollHeight
+ * @param maxHeight - max height of document to scrollHeight
  * @param elementToScroll - CSS selector of element where we want to scroll, default is 'body'
  * @return {Promise.}
  */
@@ -96,30 +96,29 @@ module.exports = async (page, maxHeight, elementToScroll = 'body') => {
             const pendingRequestsCount = resourcesStats.requested - (resourcesStats.finished + resourcesStats.failed + resourcesStats.forgotten);
+            // We have to wait if all xhrs are finished
             if (pendingRequestsCount === 0) {
-                // If the page is scrolled to the very bottom or beyond maximum height, we are done
                 const isLoaderOnPage = await page.evaluate(() => {
                     const loader = $('.section-loading-spinner');
-                    if (loader) {
-                        return loader.parent().attr('style') !== 'display: none;';
-                    }
+                    if (loader) return loader.parent().attr('style') !== 'display: none;';
                 });
+                const reviewsCount = await page.evaluate(() => $('div.section-review').length);
-                // console.log(reviewsCount, previosReviewsCount, isLoaderOnPage);
+                /**
+                 * If the page is scrolled to the very bottom or beyond
+                 * maximum height and loader is not displayed and we don't find new reviews, we are done.
+ */ if (reviewsCount === previosReviewsCount && (scrollInfo.scrollTop + scrollInfo.clientHeight >= Math.min(scrollInfo.scrollHeight, maxHeight)) && !isLoaderOnPage ) break; previosReviewsCount = reviewsCount; + // Otherwise we try to scroll down await scrollTo(page, elementToScroll, maxHeight); } await sleep(defaultScrollDelay); } - // Scroll back up, otherwise the screenshot of the browser would only show the bottom of - // the page - await scrollTo(page, elementToScroll, maxHeight); - logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`); } catch (err) { logError('An exception thrown in infiniteScroll()', err); diff --git a/src/main.js b/src/main.js index d81d489..4c7eb3d 100644 --- a/src/main.js +++ b/src/main.js @@ -1,96 +1,7 @@ -/** - * Run the following example to perform a recursive crawl of a website using Puppeteer. - */ const Apify = require('apify'); -const infiniteScroll = require('./infinite_scroll'); - -const { sleep } = Apify.utils; -const { injectJQuery } = Apify.utils.puppeteer; - -// NOTE: This is not nice, it waits for implementing default timeout into puppeteer. -const DEFAULT_TIMEOUT = 60 * 1000; // 60 sec - -const LISTING_PAGINATION_KEY = 'listingState'; -const MAX_PAGE_RETRIES = 5; - -const waitForGoogleMapLoader = (page) => page.waitFor(() => !document.querySelector('#searchbox').classList.contains('loading'), { timeout: DEFAULT_TIMEOUT }); - -const enqueueAllUrlsFromPagination = async (page, requestQueue) => { - const detailLinks = []; - let results = await page.$$('.section-result'); - const resultsCount = results.length; - for (let resultIndex = 0; resultIndex < resultsCount; resultIndex++) { - // Need to get results again, pupptr lost context.. 
-        await page.waitForSelector('.searchbox', { timeout: DEFAULT_TIMEOUT });
-        await waitForGoogleMapLoader(page);
-        await page.waitFor((resultIndex) => {
-            return document.querySelectorAll('.section-result h3').length >= resultIndex + 1;
-        }, { timeout: DEFAULT_TIMEOUT }, resultIndex);
-        results = await page.$$('.section-result');
-        const link = await results[resultIndex].$('h3');
-        await link.click();
-        await waitForGoogleMapLoader(page);
-        await page.waitForSelector('.section-back-to-list-button', { timeout: DEFAULT_TIMEOUT });
-        const url = page.url();
-        await requestQueue.addRequest({ url, userData: { label: 'detail' } });
-        console.log(`Added to queue ${url}`);
-        await page.click('.section-back-to-list-button');
-    }
-    return detailLinks;
-};
-
-const enqueueAllPlaceDetails = async (startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination, retries = 0) => {
-    let browser;
-    try {
-        browser = await Apify.launchPuppeteer(launchPuppeteerOptions);
-        const page = await browser.newPage();
-        await page._client.send('Emulation.clearDeviceMetricsOverride');
-        await page.goto(startUrl);
-        await injectJQuery(page);
-        await page.type('#searchboxinput', searchString);
-        await sleep(5000);
-        await page.click('#searchbox-searchbutton');
-        await sleep(5000);
-        const nextButtonSelector = '#section-pagination-button-next';
-        while (true) {
-            await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
-            const paginationText = await page.$eval('.section-pagination-right', (el) => el.innerText);
-            const [fromString, toString] = paginationText.match(/\d+/g);
-            const from = parseInt(fromString);
-            const to = parseInt(toString);
-            if (listingPagination.from && from <= listingPagination.from) {
-                console.log(`Skiped pagination ${from} - ${to}, already done!`);
-            } else {
-                console.log(`Added links from pagination ${from} - ${to}`);
-                await enqueueAllUrlsFromPagination(page, requestQueue);
-                listingPagination = { from, to };
-                await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
-            }
-            await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
-            const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => {
-                return !!$(nextButtonSelector).attr('disabled');
-            }, nextButtonSelector);
-            const noResultsEl = await page.$('.section-no-result-title');
-            if (isNextPaginationDisabled || noResultsEl) {
-                break;
-            } else {
-                // NOTE: puppeteer API click() didn't work :(
-                await page.evaluate((sel) => $(sel).click(), nextButtonSelector);
-                await waitForGoogleMapLoader(page);
-            }
-        }
-    } catch (err) {
-        if (retries < MAX_PAGE_RETRIES) {
-            ++retries;
-            console.log(`Retiring enqueueAllPlaceDetails for ${retries} time.`);
-            await browser.close();
-            await enqueueAllPlaceDetails(startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination, ++retries);
-        }
-        throw err;
-    } finally {
-        if (browser) await browser.close();
-    }
-};
+const placesCrawler = require('./places_crawler');
+const enqueueAllPlaceDetailsCrawler = require('./enqueue_places_crawler');
+const { LISTING_PAGINATION_KEY } = require('./consts');
 
 Apify.main(async () => {
     const input = await Apify.getValue('INPUT');
@@ -110,100 +21,26 @@ Apify.main(async () => {
     }
     console.log('Start url is', startUrl);
 
     const requestQueue = await Apify.openRequestQueue();
     // Store state of listing pagination
     // NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination
     const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};
-
-    const launchPuppeteerOptions = {
-        // useApifyProxy: true,
-        // useChrome: true,
-        // apifyProxyGroups: ['CZECH_LUMINATI'],
-        // liveView: Apify.isAtHome(),
-    };
+    const launchPuppeteerOptions = {};
     if (proxyConfig) Object.assign(launchPuppeteerOptions, proxyConfig);
 
     // Enqueue all links to scrape from listings
     if (!listingPagination.isFinish) {
         console.log(`Start enqueuing place details for search: ${searchString}`);
-        await enqueueAllPlaceDetails(startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination);
+        await enqueueAllPlaceDetailsCrawler.run(startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination);
         listingPagination.isFinish = true;
         await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
     }
 
     // Scrape all place detail links
-    const crawler = new Apify.PuppeteerCrawler({
-        launchPuppeteerOptions,
-        requestQueue,
-        maxRequestRetries: MAX_PAGE_RETRIES,
-        retireInstanceAfterRequestCount: 10,
-        handlePageTimeoutSecs: 600,
-        gotoFunction: async ({ request, page }) => {
-            await page._client.send('Emulation.clearDeviceMetricsOverride');
-            await page.goto(request.url, { timeout: 60000 });
-        },
-        handlePageFunction: async ({ request, page }) => {
-            const { label } = request.userData;
-            console.log(`Open ${request.url} with label: ${label}`);
-            // Get data from review
-            await injectJQuery(page);
-            await page.waitForSelector('h1.section-hero-header-title', { timeout: DEFAULT_TIMEOUT });
-            const placeDetail = await page.evaluate(() => {
-                return {
-                    title: $('h1.section-hero-header-title').text().trim(),
-                    totalScore: $('span.section-star-display').eq(0).text().trim(),
-                    categoryName: $('[jsaction="pane.rating.category"]').text().trim(),
-                    address: $('[data-section-id="ad"] .widget-pane-link').text().trim(),
-                    plusCode: $('[data-section-id="ol"] .widget-pane-link').text().trim(),
-                };
-            });
-            placeDetail.url = request.url;
-            placeDetail.reviews = [];
-            if (placeDetail.totalScore) {
-                placeDetail.reviewsCount = await page.evaluate(() => {
-                    const numberReviewsText = $('button.section-reviewchart-numreviews').text().trim();
-                    return (numberReviewsText) ? numberReviewsText.match(/\d+/)[0] : null;
-                });
-                // Get all reviews
-                await page.click('button.section-reviewchart-numreviews');
-                await page.waitForSelector('.section-star-display', { timeout: DEFAULT_TIMEOUT });
-                await infiniteScroll(page, 99999999999, '.section-scrollbox.section-listbox');
-                await sleep(2000);
-                const reviewEls = await page.$$('div.section-review');
-                for (const reviewEl of reviewEls) {
-                    const moreButton = await reviewEl.$('.section-expand-review');
-                    if (moreButton) {
-                        await moreButton.click();
-                        await sleep(1000);
-                    }
-                    const review = await page.evaluate((reviewEl) => {
-                        const $review = $(reviewEl);
-                        return {
-                            name: $review.find('.section-review-title').text().trim(),
-                            text: $review.find('.section-review-text').text(),
-                            stars: $review.find('.section-review-stars').attr('aria-label').trim(),
-                            publishAt: $review.find('.section-review-publish-date').text().trim(),
-                            likesCount: $review.find('.section-review-thumbs-up-count').text().trim(),
-                        };
-                    }, reviewEl);
-                    placeDetail.reviews.push(review);
-                }
-            }
-            await Apify.pushData(placeDetail);
-
-            console.log(request.url, 'Done');
-        },
-        handleFailedRequestFunction: async ({ request }) => {
-            // This function is called when crawling of a request failed too many time
-            await Apify.pushData({
-                url: request.url,
-                succeeded: false,
-                errors: request.errorMessages,
-            });
-        },
-    });
-
+    const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue);
     await crawler.run();
+
+    console.log('Done!');
 });
diff --git a/src/places_crawler.js b/src/places_crawler.js
new file mode 100644
index 0000000..dadc3ca
--- /dev/null
+++ b/src/places_crawler.js
@@ -0,0 +1,91 @@
+const Apify = require('apify');
+const { sleep } = Apify.utils;
+const infiniteScroll = require('./infinite_scroll');
+const { injectJQuery } = Apify.utils.puppeteer;
+const { MAX_PAGE_RETRIES, DEFAULT_TIMEOUT } = require('./consts');
+
+/**
+ * Method to set up crawler to get all place details and save them to default dataset
+ * @param launchPuppeteerOptions
+ * @param requestQueue
+ * @return {Apify.PuppeteerCrawler}
+ */
+const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
+    return new Apify.PuppeteerCrawler({
+        launchPuppeteerOptions,
+        requestQueue,
+        maxRequestRetries: MAX_PAGE_RETRIES,
+        retireInstanceAfterRequestCount: 10,
+        handlePageTimeoutSecs: 600,
+        maxConcurrency: 1,
+        gotoFunction: async ({ request, page }) => {
+            await page._client.send('Emulation.clearDeviceMetricsOverride');
+            await page.goto(request.url, { timeout: 60000 });
+        },
+        handlePageFunction: async ({ request, page }) => {
+            const { label } = request.userData;
+            console.log(`Open ${request.url} with label: ${label}`);
+            // Get data from review
+            await injectJQuery(page);
+            await page.waitForSelector('h1.section-hero-header-title', { timeout: DEFAULT_TIMEOUT });
+            const placeDetail = await page.evaluate(() => {
+                return {
+                    title: $('h1.section-hero-header-title').text().trim(),
+                    totalScore: $('span.section-star-display').eq(0).text().trim(),
+                    categoryName: $('[jsaction="pane.rating.category"]').text().trim(),
+                    address: $('[data-section-id="ad"] .widget-pane-link').text().trim(),
+                    plusCode: $('[data-section-id="ol"] .widget-pane-link').text().trim(),
+                };
+            });
+            placeDetail.url = request.url;
+            placeDetail.reviews = [];
+            if (placeDetail.totalScore) {
+                placeDetail.reviewsCount = await page.evaluate(() => {
+                    const numberReviewsText = $('button.section-reviewchart-numreviews').text().trim();
+                    return (numberReviewsText) ? numberReviewsText.replace(/[^0-9]/g, '') : null; // digits only, so "1,234" -> "1234"
+                });
+                // Get all reviews
+                await page.click('button.section-reviewchart-numreviews');
+                await page.waitForSelector('.section-star-display', { timeout: DEFAULT_TIMEOUT });
+                await sleep(2000);
+                // Sort reviews by newest
+                await page.click('.section-tab-info-stats-button-flex');
+                await page.waitForSelector('.context-menu-entry[data-index="1"]', { timeout: DEFAULT_TIMEOUT });
+                await page.click('.context-menu-entry[data-index="1"]');
+                await infiniteScroll(page, 99999999999, '.section-scrollbox.section-listbox');
+                const reviewEls = await page.$$('div.section-review');
+                for (const reviewEl of reviewEls) {
+                    const moreButton = await reviewEl.$('.section-expand-review');
+                    if (moreButton) {
+                        await moreButton.click();
+                        await sleep(1000);
+                    }
+                    const review = await page.evaluate((reviewEl) => {
+                        const $review = $(reviewEl);
+                        return {
+                            name: $review.find('.section-review-title').text().trim(),
+                            text: $review.find('.section-review-text').text(),
+                            stars: ($review.find('.section-review-stars').attr('aria-label') || '').trim(),
+                            publishAt: $review.find('.section-review-publish-date').text().trim(),
+                            likesCount: $review.find('.section-review-thumbs-up-count').text().trim(),
+                        };
+                    }, reviewEl);
+                    placeDetail.reviews.push(review);
+                }
+            }
+            await Apify.pushData(placeDetail);
+
+            console.log(request.url, 'Done');
+        },
+        handleFailedRequestFunction: async ({ request }) => {
+            // This function is called when crawling of a request failed too many times
+            await Apify.pushData({
+                url: request.url,
+                succeeded: false,
+                errors: request.errorMessages,
+            });
+        },
+    });
+};
+
+module.exports = { setUpCrawler };