From e6612c9743b056821792d854ef0c9fb992adb843 Mon Sep 17 00:00:00 2001
From: JakubDrobnik
Date: Mon, 10 Dec 2018 15:52:40 +0100
Subject: [PATCH] Added max crawled places option

---
 INPUT_SCHEMA.json             |  6 ++++++
 src/enqueue_places_crawler.js |  8 ++++----
 src/main.js                   |  4 ++--
 src/places_crawler.js         | 14 ++++++++++----
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/INPUT_SCHEMA.json b/INPUT_SCHEMA.json
index 398e153..050fb31 100644
--- a/INPUT_SCHEMA.json
+++ b/INPUT_SCHEMA.json
@@ -18,6 +18,12 @@
             "prefill": { "useApifyProxy": true },
             "editor": "proxy"
         },
+        "maxCrawledPlaces": {
+            "title": "Max crawled places",
+            "type": "integer",
+            "description": "Use this to limit the number of places you get from the crawler. If you set 0 or leave it empty, all places will be scraped.",
+            "minimum": 0
+        },
         "lat": {
             "title": "Viewport Latitude",
             "type": "string",
diff --git a/src/enqueue_places_crawler.js b/src/enqueue_places_crawler.js
index 5559776..577dc0a 100644
--- a/src/enqueue_places_crawler.js
+++ b/src/enqueue_places_crawler.js
@@ -31,14 +31,14 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
 
 /**
  * Crawler add all place detail from listing to queue
- * @param startUrl
+ * @param page
  * @param searchString
  * @param launchPuppeteerOptions
  * @param requestQueue
  * @param listingPagination
- * @param retries
+ * @param maxRequestsPerCrawl
  */
-const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination) => {
+const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxRequestsPerCrawl) => {
     await page.type('#searchboxinput', searchString);
     await sleep(5000);
     await page.click('#searchbox-searchbutton');
@@ -77,7 +77,7 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
                 .attr('disabled');
         }, nextButtonSelector);
         const noResultsEl = await page.$('.section-no-result-title');
-        if (isNextPaginationDisabled || noResultsEl) {
+        if (isNextPaginationDisabled || noResultsEl || (maxRequestsPerCrawl && maxRequestsPerCrawl < to)) {
             break;
         } else {
             // NOTE: puppeteer API click() didn't work :(
diff --git a/src/main.js b/src/main.js
index ea1002a..3425f10 100644
--- a/src/main.js
+++ b/src/main.js
@@ -3,7 +3,7 @@ const placesCrawler = require('./places_crawler');
 
 Apify.main(async () => {
     const input = await Apify.getValue('INPUT');
-    const { searchString, proxyConfig, lat, lng } = input;
+    const { searchString, proxyConfig, lat, lng, maxCrawledPlaces } = input;
 
     if (!searchString) throw new Error('Attribute searchString missing in input.');
 
@@ -26,7 +26,7 @@ Apify.main(async () => {
     if (proxyConfig) Object.assign(launchPuppeteerOptions, proxyConfig);
 
     // Scrape all place detail links
-    const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue);
+    const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue, maxCrawledPlaces);
     await crawler.run();
 
     console.log('Done!');
diff --git a/src/places_crawler.js b/src/places_crawler.js
index 20e64e3..a53e7bb 100644
--- a/src/places_crawler.js
+++ b/src/places_crawler.js
@@ -11,10 +11,11 @@ const enqueueAllPlaceDetailsCrawler = require('./enqueue_places_crawler');
  * Method to set up crawler to get all place details and save them to default dataset
  * @param launchPuppeteerOptions
  * @param requestQueue
+ * @param maxCrawledPlaces
  * @return {Apify.PuppeteerCrawler}
  */
-const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
-    return new Apify.PuppeteerCrawler({
+const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) => {
+    const crawlerOpts = {
         launchPuppeteerOptions,
         requestQueue,
         maxRequestRetries: MAX_PAGE_RETRIES,
@@ -22,6 +23,11 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
         handlePageTimeoutSecs: 2 * 3600, // Two hours because startUrl crawler
         maxOpenPagesPerInstance: 1, // Because startUrl crawler crashes if we mixed it with details scraping
         // maxConcurrency: 1,
+    };
+    if (maxCrawledPlaces) {
+        crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl
+    }
+    return new Apify.PuppeteerCrawler(Object.assign(crawlerOpts, {
         gotoFunction: async ({ request, page }) => {
             await page._client.send('Emulation.clearDeviceMetricsOverride');
             await page.goto(request.url, { timeout: 60000 });
@@ -36,7 +42,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
             // Store state of listing pagination
             // NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination
             const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};
-            await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination);
+            await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, crawlerOpts.maxRequestsPerCrawl);
             listingPagination.isFinish = true;
             await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
         } else {
@@ -113,7 +119,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
                 errors: request.errorMessages,
             });
         },
-    });
+    }));
 };
 
 module.exports = { setUpCrawler };
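
For context, a minimal sketch of an actor INPUT that would exercise the new option. The field names (searchString, proxyConfig, lat, lng, maxCrawledPlaces) come from INPUT_SCHEMA.json and src/main.js in this patch; the values below are illustrative only.

    {
        "searchString": "restaurants in Prague",
        "lat": "50.0755",
        "lng": "14.4378",
        "proxyConfig": { "useApifyProxy": true },
        "maxCrawledPlaces": 50
    }

With maxCrawledPlaces set to 50, setUpCrawler would cap maxRequestsPerCrawl at 51 (the extra request is the start URL itself); leaving it at 0 or unset keeps the crawl unlimited.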