diff --git a/package.json b/package.json
index 31eb5c6..7852ebb 100644
--- a/package.json
+++ b/package.json
@@ -4,7 +4,8 @@
   "description": "",
   "main": "main.js",
   "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "test": "echo \"Error: no test specified\" && exit 1",
+    "start": "node ./src/main.js"
   },
   "repository": {
     "type": "git",
@@ -20,5 +21,8 @@
   "bugs": {
     "url": "https://github.com/drobnikj/crawler-google-places/issues"
   },
-  "homepage": "https://github.com/drobnikj/crawler-google-places#readme"
+  "homepage": "https://github.com/drobnikj/crawler-google-places#readme",
+  "dependencies": {
+    "apify": "^0.8.18"
+  }
 }
diff --git a/src/main.js b/src/main.js
index 06e7df9..c527ef7 100644
--- a/src/main.js
+++ b/src/main.js
@@ -1 +1,224 @@
-tets
+/**
+ * Crawls Google Maps search results with Puppeteer and scrapes place details
+ * (title, score, address, reviews) for every result of the input search string.
+ */
+
+const Apify = require('apify');
+
+const { sleep } = Apify.utils;
+const { injectJQuery } = Apify.utils.puppeteer;
+
+const sleepPromised = ms => new Promise(resolve => setTimeout(resolve, ms));
+
+const logError = (msg, e) => {
+    console.log(`ERROR: ${msg}`);
+    console.error(e);
+};
+const logInfo = (msg) => console.log(`INFO: ${msg}`);
+const logDebug = (msg) => console.log(`DEBUG: ${msg}`);
+
+/**
+ * Scrolls page to xpos, ypos.
+ */
+const scrollTo = (page, xpos, ypos) => page.evaluate((x, y) => window.scrollTo(x, y), xpos, ypos);
+
+/**
+ * Returns info about page scroll.
+ */
+const getPageScrollInfo = page => page.evaluate(() => {
+    return {
+        scrollHeight: document.documentElement.scrollHeight,
+        scrollTop: document.documentElement.scrollTop,
+        clientHeight: document.documentElement.clientHeight,
+    };
+});
+
+/**
+ * Scrolls down the page until infinite scroll ends or reaches maxHeight.
+ * @param page - instance of crawled page
+ * @param maxHeight - max height of document to scroll
+ * @return {Promise.<void>}
+ */
+const infiniteScroll = async (page, maxHeight) => {
+    const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other'];
+    const stringifyScrollInfo = (scrollInfo) => {
+        return `scrollTop=${scrollInfo.scrollTop}, ` +
+            `clientHeight=${scrollInfo.clientHeight}, ` +
+            `scrollHeight=${scrollInfo.scrollHeight}, ` +
+            `maxHeight=${maxHeight}`;
+    };
+    const defaultScrollDelay = 500;
+
+    // Catch and count all page requests for resources
+    const resourcesStats = {
+        requested: 0,
+        finished: 0,
+        failed: 0,
+        forgotten: 0,
+    };
+    const pendingRequests = {};
+    page.on('request', (msg) => {
+        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
+            pendingRequests[msg._requestId] = Date.now();
+            resourcesStats.requested++;
+        }
+    });
+    page.on('requestfailed', (msg) => {
+        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
+            if (pendingRequests[msg._requestId]) {
+                delete pendingRequests[msg._requestId];
+                resourcesStats.failed++;
+            }
+        }
+    });
+    page.on('requestfinished', (msg) => {
+        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType)) {
+            if (pendingRequests[msg._requestId]) {
+                delete pendingRequests[msg._requestId];
+                resourcesStats.finished++;
+            }
+        }
+    });
+
+    try {
+        let scrollInfo = await getPageScrollInfo(page);
+        logInfo(`Infinite scroll started (${stringifyScrollInfo(scrollInfo)}).`);
+
+        while (true) {
+            scrollInfo = await getPageScrollInfo(page);
+
+            // Forget pending resources that didn't finish loading in time
+            const now = Date.now();
+            const timeout = 30000; // TODO: use resourceTimeout
+            Object.keys(pendingRequests)
+                .forEach((requestId) => {
+                    if (pendingRequests[requestId] + timeout < now) {
+                        delete pendingRequests[requestId];
+                        resourcesStats.forgotten++;
+                    }
+                });
+
+            logDebug(`Infinite scroll stats (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)}).`);
+
+            const pendingRequestsCount = resourcesStats.requested - (resourcesStats.finished + resourcesStats.failed + resourcesStats.forgotten);
+            if (pendingRequestsCount === 0) {
+                // If the page is scrolled to the very bottom or beyond maximum height, we are done
+                if (scrollInfo.scrollTop + scrollInfo.clientHeight >= Math.min(scrollInfo.scrollHeight, maxHeight)) break;
+                // Otherwise we try to scroll down
+                await scrollTo(page, 0, scrollInfo.scrollHeight);
+            }
+
+            await sleepPromised(defaultScrollDelay);
+        }
+        // Scroll back up, otherwise the screenshot of the browser would only show the bottom of
+        // the page
+        await scrollTo(page, 0, 0);
+
+        logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
+    } catch (err) {
+        logError('An exception thrown in infiniteScroll()', err);
+    }
+};
+
+/**
+ * Opens each result on the current pagination page and enqueues its detail URL.
+ */
+const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
+    const detailLinks = [];
+    let results = await page.$$('.section-result');
+    const resultsCount = results.length;
+    console.log('Titles ', results.length);
+    for (let resultIndex = 0; resultIndex < resultsCount; resultIndex++) {
+        // Need to get results again, puppeteer lost context after navigation
+        results = await page.$$('.section-result');
+        const link = await results[resultIndex].$('h3');
+        await link.click();
+        await page.waitForSelector('.section-back-to-list-button');
+        const url = page.url();
+        console.log(url);
+        await requestQueue.addRequest({ url, userData: { label: 'detail' } });
+        await page.click('.section-back-to-list-button');
+        await sleep(5000);
+    }
+    return detailLinks;
+};
+
+Apify.main(async () => {
+    const { searchString } = await Apify.getValue('INPUT');
+
+    if (!searchString) throw new Error('Attribute searchString missing in input.');
+
+    const requestQueue = await Apify.openRequestQueue();
+    await requestQueue.addRequest({ url: 'https://www.google.com/maps/search/', userData: { label: 'startUrl' } });
+
+    const crawler = new Apify.PuppeteerCrawler({
+        launchPuppeteerOptions: {
+            useApifyProxy: true,
+        },
+        requestQueue,
+        handlePageFunction: async ({ request, page }) => {
+            const { label } = request.userData;
+            if (label === 'startUrl') {
+                // Enqueue all urls for place detail; search the term from INPUT
+                await page.type('#searchboxinput', searchString);
+                await sleep(5000);
+                await page.click('#searchbox-searchbutton');
+                await sleep(5000);
+                while (true) {
+                    await page.waitForSelector('#section-pagination-button-next');
+                    await enqueueAllUrlsFromPagination(page, requestQueue);
+                    const nextButton = await page.$('#section-pagination-button-next');
+                    // Read the DOM property value; the element handle itself can not be compared to a string
+                    const isNextPagination = await page.evaluate(button => button.disabled, nextButton);
+                    console.log('isNextPagination ', isNextPagination);
+                    if (isNextPagination) {
+                        break;
+                    } else {
+                        await nextButton.click();
+                    }
+                    await sleep(5000);
+                }
+            } else {
+                await injectJQuery(page);
+                await page.waitForSelector('h1.section-hero-header-title');
+                // Get data from place detail pane
+                const placeDetail = await page.evaluate(() => {
+                    return {
+                        title: $('h1.section-hero-header-title').text().trim(),
+                        totalScore: $('span.section-star-display').eq(0).text().trim(),
+                        reviewsCount: $('button.section-reviewchart-numreviews').text().trim().match(/\d+/)[0],
+                        categoryName: $('[jsaction="pane.rating.category"]').text().trim(),
+                        address: $('[data-section-id="ad"] .widget-pane-link').text().trim(),
+                        plusCode: $('[data-section-id="ol"] .widget-pane-link').text().trim(),
+                    };
+                });
+                placeDetail.url = request.url;
+                placeDetail.reviews = [];
+                console.log(placeDetail);
+                // Get all reviews
+                await page.click('button.section-reviewchart-numreviews');
+                await infiniteScroll(page, 99999999999);
+                const reviewEls = await page.$$('div.section-review');
+                for (const reviewEl of reviewEls) {
+                    const moreButton = await reviewEl.$('.section-expand-review');
+                    if (moreButton) {
+                        await moreButton.click();
+                        await sleep(1000);
+                    }
+                    const review = await page.evaluate((reviewEl) => {
+                        const $review = $(reviewEl);
+                        return {
+                            name: $review.find('.section-review-title').text().trim(),
+                            text: $review.find('.section-review-text').text(),
+                            stars: $review.find('.section-review-stars').attr('aria-label'),
+                            publishAt: $review.find('.section-review-publish-date').text().trim(),
+                            likesCount: $review.find('.section-review-thumbs-up-count').text().trim(),
+                        };
+                    }, reviewEl);
+                    console.log(review);
+                    placeDetail.reviews.push(review);
+                }
+                await Apify.pushData(placeDetail);
+            }
+        },
+        maxConcurrency: 1,
+    });
+
+    await crawler.run();
+});