Added max crawled places option

JakubDrobnik 2018-12-10 15:52:40 +01:00
parent 0689be833c
commit e6612c9743
4 changed files with 22 additions and 10 deletions

View File

@@ -18,6 +18,12 @@
         "prefill": { "useApifyProxy": true },
         "editor": "proxy"
     },
+    "maxCrawledPlaces": {
+        "title": "Max crawled places",
+        "type": "integer",
+        "description": "Use to limit the number of places you want to get from the crawler. If you fill in 0 or nothing, all places will be scraped.",
+        "minimum": 0
+    },
     "lat": {
         "title": "Viewport Latitude",
         "type": "string",

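As a usage note (not part of the commit): an INPUT object exercising the new field could look like the following. The search term and the limit of 50 are purely illustrative; the other fields follow the existing schema.

{
    "searchString": "restaurant Prague",
    "proxyConfig": { "useApifyProxy": true },
    "maxCrawledPlaces": 50
}
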
View File

@@ -31,14 +31,14 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
 /**
  * Crawler add all place detail from listing to queue
- * @param startUrl
+ * @param page
  * @param searchString
  * @param launchPuppeteerOptions
  * @param requestQueue
  * @param listingPagination
- * @param retries
+ * @param maxRequestsPerCrawl
  */
-const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination) => {
+const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxRequestsPerCrawl) => {
     await page.type('#searchboxinput', searchString);
     await sleep(5000);
     await page.click('#searchbox-searchbutton');
@@ -77,7 +77,7 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
             .attr('disabled');
         }, nextButtonSelector);
         const noResultsEl = await page.$('.section-no-result-title');
-        if (isNextPaginationDisabled || noResultsEl) {
+        if (isNextPaginationDisabled || noResultsEl || (maxRequestsPerCrawl && maxRequestsPerCrawl < to)) {
             break;
         } else {
             // NOTE: puppeteer API click() didn't work :(

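The behavioural change in this file is the extra clause in the break condition above. A minimal sketch of that check in isolation (assumptions: plain Node, `to` is the index of the last listing result reached so far as suggested by the surrounding pagination code, and a falsy maxRequestsPerCrawl means no limit; `shouldStopPaginating` is a made-up name, not the actor's code):

const shouldStopPaginating = (isNextPaginationDisabled, noResultsEl, maxRequestsPerCrawl, to) =>
    Boolean(isNextPaginationDisabled)
    || Boolean(noResultsEl)
    || Boolean(maxRequestsPerCrawl && maxRequestsPerCrawl < to);

console.log(shouldStopPaginating(false, null, 20, 40)); // true  - the 20-request budget is already exceeded
console.log(shouldStopPaginating(false, null, 0, 40));  // false - 0/undefined keeps paginating
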
View File

@@ -3,7 +3,7 @@ const placesCrawler = require('./places_crawler');
 Apify.main(async () => {
     const input = await Apify.getValue('INPUT');
-    const { searchString, proxyConfig, lat, lng } = input;
+    const { searchString, proxyConfig, lat, lng, maxCrawledPlaces } = input;
     if (!searchString) throw new Error('Attribute searchString missing in input.');
@@ -26,7 +26,7 @@ Apify.main(async () => {
     if (proxyConfig) Object.assign(launchPuppeteerOptions, proxyConfig);
     // Scrape all place detail links
-    const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue);
+    const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue, maxCrawledPlaces);
     await crawler.run();
     console.log('Done!');

View File

@@ -11,10 +11,11 @@ const enqueueAllPlaceDetailsCrawler = require('./enqueue_places_crawler');
  * Method to set up crawler to get all place details and save them to default dataset
  * @param launchPuppeteerOptions
  * @param requestQueue
+ * @param maxCrawledPlaces
  * @return {Apify.PuppeteerCrawler}
  */
-const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
-    return new Apify.PuppeteerCrawler({
+const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) => {
+    const crawlerOpts = {
         launchPuppeteerOptions,
         requestQueue,
         maxRequestRetries: MAX_PAGE_RETRIES,
@@ -22,6 +23,11 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
         handlePageTimeoutSecs: 2 * 3600, // Two hours because startUrl crawler
         maxOpenPagesPerInstance: 1, // Because startUrl crawler crashes if we mixed it with details scraping
         // maxConcurrency: 1,
+    };
+    if (maxCrawledPlaces) {
+        crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl
+    }
+    return new Apify.PuppeteerCrawler(Object.assign(crawlerOpts, {
         gotoFunction: async ({ request, page }) => {
             await page._client.send('Emulation.clearDeviceMetricsOverride');
             await page.goto(request.url, { timeout: 60000 });
@@ -36,7 +42,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
             // Store state of listing pagination
            // NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination
            const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};
-            await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination);
+            await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, crawlerOpts.maxRequestsPerCrawl);
             listingPagination.isFinish = true;
             await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
         } else {
@@ -113,7 +119,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
                 errors: request.errorMessages,
             });
         },
-    });
+    }));
 };
 module.exports = { setUpCrawler };
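
To summarize the wiring, here is a sketch under assumptions (buildCrawlerOpts is a made-up name, MAX_PAGE_RETRIES is inlined as a placeholder; this is not the actor's full setup): the optional limit from INPUT becomes PuppeteerCrawler's maxRequestsPerCrawl, offset by one because the startUrl (the search results page) also consumes one request of the crawl.

const buildCrawlerOpts = (maxCrawledPlaces) => {
    const crawlerOpts = {
        maxRequestRetries: 10, // stands in for MAX_PAGE_RETRIES
    };
    if (maxCrawledPlaces) {
        // +1: the startUrl itself counts as one crawled request
        crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1;
    }
    return crawlerOpts;
};

console.log(buildCrawlerOpts(50)); // { maxRequestRetries: 10, maxRequestsPerCrawl: 51 }
console.log(buildCrawlerOpts(0));  // { maxRequestRetries: 10 } - falsy limit leaves the crawl unbounded

Passing crawlerOpts.maxRequestsPerCrawl (rather than the raw maxCrawledPlaces) into enqueueAllPlaceDetailsCrawler appears to keep the two limits consistent: pagination can stop as soon as more place details have been queued than the crawler is allowed to open.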