From 0689be833cdebd4f9145132ed10d5a0fe16fece0 Mon Sep 17 00:00:00 2001 From: JakubDrobnik Date: Mon, 10 Dec 2018 15:27:34 +0100 Subject: [PATCH] Moved crawler for enqueue places to main crawler --- apify.json | 11 +-- src/enqueue_places_crawler.js | 115 +++++++++++++----------------- src/main.js | 14 +--- src/places_crawler.js | 129 +++++++++++++++++++--------------- 4 files changed, 127 insertions(+), 142 deletions(-) diff --git a/apify.json b/apify.json index 13ccc19..6bfc184 100644 --- a/apify.json +++ b/apify.json @@ -1,11 +1,6 @@ { "name": "crawler-google-places", - "actId": null, - "version": { - "versionNumber": "0.1", - "buildTag": "latest", - "envVars": [], - "sourceType": "TARBALL", - "tarballUrl": null - } + "version": "0.1", + "buildTag": "latest", + "env": null } diff --git a/src/enqueue_places_crawler.js b/src/enqueue_places_crawler.js index b477747..5559776 100644 --- a/src/enqueue_places_crawler.js +++ b/src/enqueue_places_crawler.js @@ -1,9 +1,11 @@ const Apify = require('apify'); -const { sleep } = Apify.utils; -const { injectJQuery } = Apify.utils.puppeteer; -const { MAX_PAGE_RETRIES, DEFAULT_TIMEOUT, LISTING_PAGINATION_KEY } = require('./consts'); -const waitForGoogleMapLoader = (page) => page.waitFor(() => !document.querySelector('#searchbox').classList.contains('loading'), { timeout: DEFAULT_TIMEOUT }); +const { sleep } = Apify.utils; +const { DEFAULT_TIMEOUT, LISTING_PAGINATION_KEY } = require('./consts'); + +const waitForGoogleMapLoader = (page) => page.waitFor(() => !document.querySelector('#searchbox') + .classList + .contains('loading'), { timeout: DEFAULT_TIMEOUT }); const enqueueAllUrlsFromPagination = async (page, requestQueue) => { let results = await page.$$('.section-result'); @@ -36,70 +38,53 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue) => { * @param listingPagination * @param retries */ -const enqueueAllPlaceDetailsCrawler = async (startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination, retries = 0) => { - let browser; +const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination) => { + await page.type('#searchboxinput', searchString); + await sleep(5000); + await page.click('#searchbox-searchbutton'); + await sleep(5000); + await waitForGoogleMapLoader(page); + // In case there is no listing, put just detail page to queue try { - browser = await Apify.launchPuppeteer(launchPuppeteerOptions); - const page = await browser.newPage(); - await page._client.send('Emulation.clearDeviceMetricsOverride'); - await page.goto(startUrl); - await injectJQuery(page); - await page.type('#searchboxinput', searchString); - await sleep(5000); - await page.click('#searchbox-searchbutton'); - await sleep(5000); - await waitForGoogleMapLoader(page); - // In case there is no listing, put just detail page to queue - try { - await page.waitForSelector('h1.section-hero-header-title'); - } catch (e) { - // It can happen, doesn't matter + await page.waitForSelector('h1.section-hero-header-title'); + } catch (e) { + // It can happen, doesn't matter + } + const maybeDetailPlace = await page.$('h1.section-hero-header-title'); + if (maybeDetailPlace) { + const url = page.url(); + await requestQueue.addRequest({ url, userData: { label: 'detail' } }); + return; + } + const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]'; + while (true) { + await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT }); + const paginationText = await page.$eval('.n7lv7yjyC35__right', (el) => el.innerText); + const [fromString, toString] = paginationText.match(/\d+/g); + const from = parseInt(fromString); + const to = parseInt(toString); + if (listingPagination.from && from <= listingPagination.from) { + console.log(`Skiped pagination ${from} - ${to}, already done!`); + } else { + console.log(`Added links from pagination ${from} - ${to}`); + await enqueueAllUrlsFromPagination(page, requestQueue); + listingPagination = { from, to }; + await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination); } - const maybeDetailPlace = await page.$('h1.section-hero-header-title'); - if (maybeDetailPlace) { - const url = page.url(); - await requestQueue.addRequest({ url, userData: { label: 'detail' } }); - return; + await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT }); + const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => { + return !!$(nextButtonSelector) + .attr('disabled'); + }, nextButtonSelector); + const noResultsEl = await page.$('.section-no-result-title'); + if (isNextPaginationDisabled || noResultsEl) { + break; + } else { + // NOTE: puppeteer API click() didn't work :( + await page.evaluate((sel) => $(sel) + .click(), nextButtonSelector); + await waitForGoogleMapLoader(page); } - const nextButtonSelector = '#section-pagination-button-next'; - while (true) { - await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT }); - const paginationText = await page.$eval('.section-pagination-right', (el) => el.innerText); - const [fromString, toString] = paginationText.match(/\d+/g); - const from = parseInt(fromString); - const to = parseInt(toString); - if (listingPagination.from && from <= listingPagination.from) { - console.log(`Skiped pagination ${from} - ${to}, already done!`); - } else { - console.log(`Added links from pagination ${from} - ${to}`); - await enqueueAllUrlsFromPagination(page, requestQueue); - listingPagination = { from, to }; - await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination); - } - await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT }); - const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => { - return !!$(nextButtonSelector).attr('disabled'); - }, nextButtonSelector); - const noResultsEl = await page.$('.section-no-result-title'); - if (isNextPaginationDisabled || noResultsEl) { - break; - } else { - // NOTE: puppeteer API click() didn't work :( - await page.evaluate((sel) => $(sel).click(), nextButtonSelector); - await waitForGoogleMapLoader(page); - } - } - } catch (err) { - if (retries < MAX_PAGE_RETRIES) { - ++retries; - console.log(`Retiring enqueueAllPlaceDetails for ${retries} time, error:`); - console.error(err); - await browser.close(); - await enqueueAllPlaceDetailsCrawler(startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination, ++retries); - } - throw err; - } finally { - if (browser) await browser.close(); } }; diff --git a/src/main.js b/src/main.js index 4c7eb3d..ea1002a 100644 --- a/src/main.js +++ b/src/main.js @@ -1,7 +1,5 @@ const Apify = require('apify'); const placesCrawler = require('./places_crawler'); -const enqueueAllPlaceDetailsCrawler = require('./enqueue_places_crawler'); -const { LISTING_PAGINATION_KEY } = require('./consts'); Apify.main(async () => { const input = await Apify.getValue('INPUT'); @@ -22,21 +20,11 @@ Apify.main(async () => { console.log('Start url is', startUrl); const requestQueue = await Apify.openRequestQueue(); + await requestQueue.addRequest({ url: startUrl, userData: { label: 'startUrl', searchString } }); - // Store state of listing pagination - // NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination - const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {}; const launchPuppeteerOptions = {}; if (proxyConfig) Object.assign(launchPuppeteerOptions, proxyConfig); - // Enqueue all links to scrape from listings - if (!listingPagination.isFinish) { - console.log(`Start enqueuing place details for search: ${searchString}`); - await enqueueAllPlaceDetailsCrawler.run(startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination); - listingPagination.isFinish = true; - await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination); - } - // Scrape all place detail links const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue); await crawler.run(); diff --git a/src/places_crawler.js b/src/places_crawler.js index 6f83ad8..20e64e3 100644 --- a/src/places_crawler.js +++ b/src/places_crawler.js @@ -4,7 +4,8 @@ const { sleep } = Apify.utils; const infiniteScroll = require('./infinite_scroll'); const { injectJQuery } = Apify.utils.puppeteer; -const { MAX_PAGE_RETRIES, DEFAULT_TIMEOUT } = require('./consts'); +const { MAX_PAGE_RETRIES, DEFAULT_TIMEOUT, LISTING_PAGINATION_KEY } = require('./consts'); +const enqueueAllPlaceDetailsCrawler = require('./enqueue_places_crawler'); /** * Method to set up crawler to get all place details and save them to default dataset @@ -18,73 +19,89 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => { requestQueue, maxRequestRetries: MAX_PAGE_RETRIES, retireInstanceAfterRequestCount: 10, - handlePageTimeoutSecs: 600, + handlePageTimeoutSecs: 2 * 3600, // Two hours because startUrl crawler + maxOpenPagesPerInstance: 1, // Because startUrl crawler crashes if we mixed it with details scraping // maxConcurrency: 1, gotoFunction: async ({ request, page }) => { await page._client.send('Emulation.clearDeviceMetricsOverride'); await page.goto(request.url, { timeout: 60000 }); }, handlePageFunction: async ({ request, page }) => { - const { label } = request.userData; + const { label, searchString } = request.userData; console.log(`Open ${request.url} with label: ${label}`); - // Get data from review await injectJQuery(page); - await page.waitForSelector('h1.section-hero-header-title', { timeout: DEFAULT_TIMEOUT }); - const placeDetail = await page.evaluate(() => { - return { - title: $('h1.section-hero-header-title').text().trim(), - totalScore: $('span.section-star-display').eq(0).text().trim(), - categoryName: $('[jsaction="pane.rating.category"]').text().trim(), - address: $('[data-section-id="ad"] .widget-pane-link').text().trim(), - plusCode: $('[data-section-id="ol"] .widget-pane-link').text().trim(), - }; - }); - placeDetail.url = request.url; - placeDetail.reviews = []; - if (placeDetail.totalScore) { - placeDetail.reviewsCount = await page.evaluate(() => { - const numberReviewsText = $('button.section-reviewchart-numreviews').text().trim(); - return (numberReviewsText) ? numberReviewsText.match(/\d+/)[0] : null; + if (label === 'startUrl') { + // enqueue all places + console.log(`Start enqueuing place details for search: ${searchString}`); + // Store state of listing pagination + // NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination + const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {}; + await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination); + listingPagination.isFinish = true; + await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination); + } else { + // Timeout because timeout for handle page is 2 hours + setTimeout(() => { + throw new Error('HandlePagefunction timed out!'); + }, 600000); + // Get data from review + await page.waitForSelector('h1.section-hero-header-title', { timeout: DEFAULT_TIMEOUT }); + const placeDetail = await page.evaluate(() => { + return { + title: $('h1.section-hero-header-title').text().trim(), + totalScore: $('span.section-star-display').eq(0).text().trim(), + categoryName: $('[jsaction="pane.rating.category"]').text().trim(), + address: $('[data-section-id="ad"] .widget-pane-link').text().trim(), + plusCode: $('[data-section-id="ol"] .widget-pane-link').text().trim(), + }; }); - // Get all reviews - await page.click('button.section-reviewchart-numreviews'); - await page.waitForSelector('.section-star-display', { timeout: DEFAULT_TIMEOUT }); - await sleep(5000); - // Sort reviews by newest, one click sometimes didn't work :) - await page.click('.section-tab-info-stats-button-flex'); - await sleep(1000); - await page.click('.section-tab-info-stats-button-flex'); - await sleep(1000); - await page.click('.section-tab-info-stats-button-flex'); - await sleep(5000); - await page.click('.context-menu-entry[data-index="1"]'); - await infiniteScroll(page, 99999999999, '.section-scrollbox.section-listbox'); - const reviewEls = await page.$$('div.section-review'); - for (const reviewEl of reviewEls) { - const moreButton = await reviewEl.$('.section-expand-review'); - if (moreButton) { - await moreButton.click(); - await sleep(2000); - } - const review = await page.evaluate((reviewEl) => { - const $review = $(reviewEl); - const reviewData = { - name: $review.find('.section-review-title').text().trim(), - text: $review.find('.section-review-review-content .section-review-text').text(), - stars: $review.find('.section-review-stars').attr('aria-label').trim(), - publishAt: $review.find('.section-review-publish-date').text().trim(), - likesCount: $review.find('.section-review-thumbs-up-count').text().trim(), - }; - const $response = $review.find('.section-review-owner-response'); - if ($response) { - reviewData.responseFromOwnerText = $response.find('.section-review-text').text().trim(); + placeDetail.url = request.url; + placeDetail.reviews = []; + if (placeDetail.totalScore) { + placeDetail.reviewsCount = await page.evaluate(() => { + const numberReviewsText = $('button.section-reviewchart-numreviews').text().trim(); + return (numberReviewsText) ? numberReviewsText.match(/\d+/)[0] : null; + }); + // Get all reviews + await page.click('button.section-reviewchart-numreviews'); + await page.waitForSelector('.section-star-display', { timeout: DEFAULT_TIMEOUT }); + await sleep(5000); + // Sort reviews by newest, one click sometimes didn't work :) + await page.click('.section-tab-info-stats-button-flex'); + await sleep(1000); + await page.click('.section-tab-info-stats-button-flex'); + await sleep(1000); + await page.click('.section-tab-info-stats-button-flex'); + await sleep(5000); + await page.click('.context-menu-entry[data-index="1"]'); + await infiniteScroll(page, 99999999999, '.section-scrollbox.section-listbox'); + const reviewEls = await page.$$('div.section-review'); + for (const reviewEl of reviewEls) { + const moreButton = await reviewEl.$('.section-expand-review'); + if (moreButton) { + await moreButton.click(); + await sleep(2000); } - return reviewData; - }, reviewEl); - placeDetail.reviews.push(review); + const review = await page.evaluate((reviewEl) => { + const $review = $(reviewEl); + const reviewData = { + name: $review.find('.section-review-title').text().trim(), + text: $review.find('.section-review-review-content .section-review-text').text(), + stars: $review.find('.section-review-stars').attr('aria-label').trim(), + publishAt: $review.find('.section-review-publish-date').text().trim(), + likesCount: $review.find('.section-review-thumbs-up-count').text().trim(), + }; + const $response = $review.find('.section-review-owner-response'); + if ($response) { + reviewData.responseFromOwnerText = $response.find('.section-review-text').text().trim(); + } + return reviewData; + }, reviewEl); + placeDetail.reviews.push(review); + } } + await Apify.pushData(placeDetail); } - await Apify.pushData(placeDetail); console.log(request.url, 'Done'); },